def test_types(self):
    """Proteins (through their sites) and sites can be filtered by site type."""

    def of_type(site_type):
        # a fresh criterion is built on every use
        return Site.types.contains(site_type)

    methylation = SiteType(name='methylation')
    protein = Protein(refseq='NM_007', id=1, sequence='ABCD')
    db.session.add(protein)

    methylated_site = Site(
        position=2, types={methylation}, residue='B', protein=protein
    )
    db.session.add(methylated_site)
    db.session.commit()

    proteins = Protein.query

    # the only site in the database is a methylation site
    assert proteins.filter(Protein.sites.any(of_type(methylation))).one()
    assert not proteins.filter(
        Protein.sites.any(~of_type(methylation))
    ).all()
    assert Site.query.filter(of_type(methylation)).count() == 1
    assert not Site.query.filter(~of_type(methylation)).all()

    # a type which was not assigned to any site
    phosphorylation = SiteType(name='phosphorylation')

    assert not proteins.filter(
        Protein.sites.any(of_type(phosphorylation))
    ).all()
    assert proteins.filter(
        Protein.sites.any(~of_type(phosphorylation))
    ).one()
    assert Site.query.filter(of_type(phosphorylation)).count() == 0
def test_sites(self):
    """The sites endpoint returns a JSON list with one entry per site."""
    protein = Protein(**test_protein_data())
    phosphorylation_site = Site(
        position=3, residue='R',
        types={SiteType(name='phosphorylation')}
    )
    methylation_site = Site(
        position=4, residue='T',
        types={SiteType(name='methylation')}
    )

    db.session.add(protein)
    protein.sites = [phosphorylation_site, methylation_site]

    response = self.client.get('/protein/sites/NM_000123')

    assert response.status_code == 200
    assert response.content_type == 'application/json'
    assert len(response.json) == 2

    # collect representations of phosphorylation sites (the last one is used)
    matching = []
    for site_repr in response.json:
        print(site_repr)
        if site_repr['type'] == 'phosphorylation':
            matching.append(site_repr)

    phosphorylation_site_repr = matching[-1] if matching else None
    assert phosphorylation_site_repr
def test_gather_residues(self):
    """find_modified_residues() collects residues of all sites of the type."""
    methylation = SiteType(name='methylation')
    protein = Protein(refseq='NM_007', id=1, sequence='ABCD')

    # residue not given: default -> 'B'
    site_with_default_residue = Site(
        position=2, types={methylation}, protein=protein
    )
    # residue given explicitly, no protein attached
    site_with_explicit_residue = Site(
        position=4, types={methylation}, residue='D'
    )

    db.session.add_all(
        [site_with_default_residue, site_with_explicit_residue]
    )
    db.session.commit()

    assert methylation.find_modified_residues() == {'B', 'D'}
def glycosylations_without_subtype_ratio(self):
    """Return the fraction of glycosylation sites that have no subtype.

    The total is the count of sites matched by the fuzzy filter of the
    'glycosylation' type; the count from `glycosylations_with_subtype()`
    is subtracted from it and the difference is divided by the total.
    """
    from models import Site, SiteType

    generic_type = SiteType.query.filter_by(name='glycosylation').one()
    total = Site.query.filter(SiteType.fuzzy_filter(generic_type)).count()

    with_subtype = self.glycosylations_with_subtype()
    without_subtype = total - with_subtype

    return without_subtype / total
def create_test_models():
    """Build a small graph of related test models.

    Returns:
        dict mapping the names 'protein', 'mutation', 'protein_kinase',
        'kinase' and 'site' to the created objects.
    """
    protein = Protein(
        refseq='NM_0001',
        gene=Gene(name='SOMEGENE'),
        sequence='ABCD'
    )
    mutation = Mutation(protein=protein, position=1, alt='E')
    protein.gene.preferred_isoform = protein

    # the mutation metadata objects are only constructed with a reference
    # to the mutation; they are not returned
    MC3Mutation(
        mutation=mutation,
        cancer=Cancer(code='CAN'),
        samples='Sample A,Sample B',
        count=2
    )
    InheritedMutation(
        mutation=mutation,
        clin_data=[
            ClinicalData(disease=Disease(name='Some disease'), sig_code=5),
            ClinicalData(disease=Disease(name='Other disease'), sig_code=2)
        ]
    )

    protein_kinase = Protein(
        refseq='NM_0002',
        gene=Gene(name='OTHERGENE'),
        sequence='ABCD'
    )
    kinase = Kinase(name='Kinase name', protein=protein_kinase)

    site = Site(
        protein=protein,
        position=1,
        residue='A',
        kinases={kinase},
        pmid={1, 2},
        types={SiteType(name='glycosylation')}
    )
    protein.sites = [site]

    return {
        'protein': protein,
        'mutation': mutation,
        'protein_kinase': protein_kinase,
        'kinase': kinase,
        'site': site
    }
def ptm_muts_of_gene(
    path_template='exported/{site_type}_muts_of_{gene}_-_{protein}.tsv',
    gene='EGFR', site_type='glycosylation', mutation_source='mc3',
    to_csv=True, show_progress=False, **kwargs
):
    """Export mutations affecting sites of given type in the gene's preferred isoform.

    Args:
        path_template: template of the output path; formatted with
            `protein` (refseq), `gene` (name) and `site_type` (name)
        gene: name of the gene
        site_type: name of the site type
        mutation_source: key of the importer in MutationImportManager
        to_csv: whether the result should be written to the formatted path
            (tab-separated, without index)
        show_progress: passed to the importer's `export_to_df`
        **kwargs: passed to the importer's constructor

    Returns:
        the value returned by the importer's `export_to_df`
    """
    importer = MutationImportManager().importers[mutation_source](**kwargs)

    chosen_type = SiteType.query.filter_by(name=site_type).one()
    chosen_gene = Gene.query.filter_by(name=gene).one()
    isoform = chosen_gene.preferred_isoform

    affects_site_of_type = Mutation.affected_sites.any(
        SiteType.fuzzy_filter(chosen_type)
    )
    in_isoform = Mutation.protein_id == isoform.id

    mutations = importer.export_to_df(
        mutation_filter=and_(affects_site_of_type, in_isoform),
        protein_filter=Protein.id == isoform.id,
        show_progress=show_progress
    )

    # the path is formatted regardless of `to_csv`
    path = path_template.format(
        protein=isoform.refseq,
        gene=chosen_gene.name,
        site_type=chosen_type.name
    )

    if to_csv:
        mutations.to_csv(path, sep='\t', index=False)

    return mutations
def __init__(self, **kwargs):
    """Set up the filters and update them from the current request."""

    def has_ptm_muts_as_sqlalchemy(value):
        return text('ptm_muts_cnt > 0') if value else text('true')

    def is_known_kinase_as_sqlalchemy(value):
        # the value is not used: the same criterion is always returned
        return Protein.kinase.any()

    sources_filter = Filter(
        Mutation, 'sources', comparators=['in'],
        choices=list(source_manager.visible_fields.keys()),
        default=None, nullable=True,
        as_sqlalchemy=sqlalchemy_filter_from_source_name
    )
    site_types_filter = Filter(
        Site, 'types', comparators=['in'],
        choices={
            site_type.name: site_type
            for site_type in SiteType.available_types()
        },
        as_sqlalchemy=SiteType.fuzzy_filter,
        as_sqlalchemy_joins=[SiteType]
    )
    has_ptm_muts_filter = Filter(
        Gene, 'has_ptm_muts', comparators=['eq'],
        as_sqlalchemy=has_ptm_muts_as_sqlalchemy
    )
    is_known_kinase_filter = Filter(
        Gene, 'is_known_kinase', comparators=['eq'],
        as_sqlalchemy=is_known_kinase_as_sqlalchemy
    )

    # filters without sqlalchemy interface are not supported for table views
    supported_source_filters = [
        source_filter
        for source_filter in source_dependent_filters()
        if source_filter.has_sqlalchemy
    ]

    filters = [
        sources_filter,
        site_types_filter,
        has_ptm_muts_filter,
        is_known_kinase_filter
    ] + supported_source_filters

    super().__init__(filters)
    self.update_from_request(request)
def test_search_mutations(self):
    """Search by genomic mutations: text query, repeated query and VCF upload."""

    def as_cell(value):
        # table cell as it is expected to appear in the response body
        return '<td>{0}</td>'.format(value).encode()

    site = Site(position=13, types={SiteType(name='methylation')})
    protein = Protein(
        refseq='NM_007', id=7, sites=[site], sequence='XXXXXXXXXXXXV'
    )

    m_in_site = Mutation(protein=protein, position=13, alt='V')
    m_out_site = Mutation(protein=protein, position=50, alt='K')

    db.session.add(protein)

    # points to the same location as first record in VCF_FILE_CONTENT
    test_query = 'chr20 14370 G A'

    from database import bdb

    # map the first genomic mutation from VCF_FILE_CONTENT
    # to some (mocked) protein mutation
    bdb.add_genomic_mut('20', 14370, 'G', 'A', m_in_site, is_ptm=True)

    #
    # basic test - is appropriate mutation in results?
    #
    response = self.search_mutations(mutations=test_query)
    assert response.status_code == 200

    # this mutation is exactly at a PTM site and should be included in results
    assert as_cell(m_in_site.alt) in response.data
    # this mutation lies outside of a PTM site - by default should be filtered out
    assert as_cell(m_out_site.alt) not in response.data

    #
    # count test - is mutation for this query annotated as shown twice?
    #
    doubled_query = '\n'.join([test_query, test_query])
    response = self.search_mutations(mutations=doubled_query)

    assert response.status_code == 200
    assert b'<td>2</td>' in response.data

    #
    # VCF file test
    #
    upload = {'vcf-file': (BytesIO(VCF_FILE_CONTENT), 'exemplar_vcf.vcf')}
    response = self.client.post(
        '/search/mutations',
        content_type='multipart/form-data',
        data=upload
    )

    assert response.status_code == 200
    assert b'NM_007' in response.data
def glycosylations_with_subtype(self):
    """Count sites having a glycosylation subtype.

    A subtype is any site type whose name contains 'glycosylation'
    but is not exactly 'glycosylation'.
    """
    from models import Site, SiteType

    subtype_ids = []
    for type_name, type_id in SiteType.id_by_name().items():
        if type_name == 'glycosylation':
            continue
        if 'glycosylation' in type_name:
            subtype_ids.append(type_id)

    has_subtype = Site.types.any(SiteType.id.in_(subtype_ids))
    return Site.query.filter(has_subtype).count()
def test_default_residue(self):
    """A site created without a residue gets one from the protein sequence."""
    protein = Protein(refseq='NM_007', id=1, sequence='ABCD')
    methylation = SiteType(name='methylation')

    # note: for sites, positions are 1-based
    site = Site(position=2, types={methylation}, protein=protein)

    db.session.add(protein)
    db.session.commit()

    assert site.residue == 'B'
def test_train_model(self):
    """Train a model on two mocked proteins and check the basics of the result."""
    phosphorylation = SiteType(name='phosphorylation')

    # non-phosphorylated serine residues are needed to generate negative sites
    background_protein = Protein(
        refseq='NM_007', sequence='--------SLPA-----------SVIT-------'
    )
    db.session.add(
        Gene(isoforms=[background_protein], preferred_isoform=background_protein)
    )

    # phosphorylated, with sites
    modified_protein = Protein(
        refseq='NM_001', sequence='--------SPAK-----------SPAR-------'
    )
    db.session.add(
        Gene(isoforms=[modified_protein], preferred_isoform=modified_protein)
    )

    cdk1 = Kinase(name='CDK1', is_involved_in={phosphorylation})

    for position in (9, 24):
        db.session.add(
            Site(
                position=position, types={phosphorylation}, residue='S',
                protein=modified_protein, kinases={cdk1}
            )
        )

    db.session.commit()

    with TemporaryDirectory() as temp_dir:
        model = train_model(
            phosphorylation, sequences_dir=temp_dir,
            sampling_n=2, threshold=2
        )

    # the model should have one set of params - for CDK1 kinase
    assert len(model) == 1

    pwm = model.rx2('CDK1').rx2('pwm')

    # and the position-specific weight matrix should be created
    assert pwm

    # the very detailed testing should be performed by rMIMP,
    # but why not test the basics?
    weights_of_central_aa = dict(zip(pwm.rownames, pwm.rx(True, 8)))
    highest_weight = max(weights_of_central_aa.values())

    assert weights_of_central_aa['S'] == highest_weight
def most_mutated_sites(
    sources: List[MutationSource], site_type: SiteType = None, limit=25,
    intersection=True, exclusive=None, mutation_filter=None
):
    """Build a query for sites ordered by the summed mutation counts, descending.

    Sources must have the same value_type (counts/frequencies).

    Args:
        sources: mutation sources to be summed
        site_type: if given, only sites matched by its fuzzy filter are used
        limit: maximal number of rows
        intersection: inner-join the sources if true, outer-join otherwise
        exclusive: sources in which the mutations must not be present;
            cannot be combined with `intersection`
        mutation_filter: optional additional filter

    Returns:
        query yielding (Site, mutations_count) rows
    """
    assert not (intersection and exclusive)

    counts = prepare_for_summing(sources)
    labelled_counts = [
        count.label(f'count_{i}')
        for i, count in enumerate(counts)
    ]

    per_site = db.session.query(Site, *labelled_counts).select_from(Mutation)

    join_kind = 'join' if intersection else 'outerjoin'
    for source in sources:
        per_site = getattr(per_site, join_kind)(source)

    if exclusive:
        per_site = per_site.filter(~Mutation.in_sources(*exclusive))

    if mutation_filter is not None:
        per_site = per_site.filter(mutation_filter)

    per_site = per_site.join(Mutation.affected_sites)
    per_site = per_site.filter(Site.protein.has(Protein.is_preferred_isoform))

    if site_type:
        per_site = per_site.filter(SiteType.fuzzy_filter(site_type, join=True))

    per_site = per_site.group_by(Site).having(and_(*counts)).subquery()

    # sum the per-source count columns of the subquery
    count_columns = [
        getattr(per_site.c, f'count_{i}')
        for i in range(len(counts))
    ]
    total_muts_count = reduce(operator.add, count_columns)
    total_muts_count = total_muts_count.label('mutations_count')

    ranked = db.session.query(aliased(Site, per_site), total_muts_count)
    ranked = ranked.order_by(desc(total_muts_count))

    return ranked.limit(limit)
def site_type_filter_from_str(query, site=Site):
    """Translate a textual site type query into a filter.

    Args:
        query: 'any', a site type name, or a site type name preceded by
            'not' and a single separator character
        site: passed to `SiteType.fuzzy_filter` as `site`

    Returns:
        None for 'any'; otherwise the fuzzy filter of the named site type,
        negated if the query started with 'not'.
    """
    if query == 'any':
        return

    negate = query.startswith('not')
    # four characters are dropped: 'not' and the character following it
    type_name = query[4:] if negate else query

    site_type = SiteType.query.filter_by(name=type_name).one()
    site_filter = SiteType.fuzzy_filter(site_type, join=True, site=site)

    return ~site_filter if negate else site_filter
def are_glycosylation_sites_mutated_more_often(
    source_name: str, disordered=None, alternative='greater'
):
    """Run Fisher's exact test: mutated vs not mutated, glycosylation vs other sites.

    Args:
        source_name: name of the mutation source (key of
            `source_manager.source_by_name`)
        disordered: passed to `count_mutated_sites`
        alternative: passed to `fisher_exact`

    Returns:
        (oddsratio, pvalue) tuple
    """
    from stats.table import count_mutated_sites

    glycosylation = SiteType.query.filter_by(name='glycosylation').one()
    non_glycosylation = SiteType.query.filter(
        ~SiteType.name.contains('glycosylation')
    ).all()

    print(f'Comparing {glycosylation} against {non_glycosylation}')

    source = source_manager.source_by_name[source_name]
    count = partial(
        count_mutated_sites,
        model=source, only_primary=True, disordered=disordered
    )

    glyco_filter = SiteType.fuzzy_filter(glycosylation, join=True)

    # ids of all the types with 'glycosylation' in the name
    glycosylation_types = SiteType.query.filter(
        SiteType.name.contains('glycosylation')
    ).all()
    glycosylation_ids = [site_type.id for site_type in glycosylation_types]
    non_glyco_filter = Site.types.any(~SiteType.id.in_(glycosylation_ids))

    mutated_glycosylation = count(custom_filter=glyco_filter)
    mutated_non_glycosylation = count(custom_filter=non_glyco_filter)

    total_glycosylation = Site.query.filter(glyco_filter).count()
    total_non_glycosylation = Site.query.filter(non_glyco_filter).count()

    #         mutated | not_mutated
    #  glyc |
    # other |
    glycosylation_row = [
        mutated_glycosylation,
        total_glycosylation - mutated_glycosylation
    ]
    other_row = [
        mutated_non_glycosylation,
        total_non_glycosylation - mutated_non_glycosylation
    ]
    contingency_table = [glycosylation_row, other_row]
    print(contingency_table)

    oddsratio, pvalue = fisher_exact(
        contingency_table, alternative=alternative
    )
    print(source_name, oddsratio, pvalue)

    return oddsratio, pvalue
def test_gather_negative_sites(self):
    """Negative sites are the given residues which are not excluded sites."""
    protein = Protein(
        refseq='NM_007', sequence='X---------X------------YXY--------'
    )
    gene = Gene(isoforms=[protein], preferred_isoform=protein)

    # one-based
    known_site = Site(
        position=11, types={SiteType(name='methylation')},
        residue='X', protein=protein
    )

    db.session.add_all([gene, protein, known_site])

    negative_sites = gather_negative_sites(
        residues={'X'}, exclude={known_site}
    )

    # zero-based
    expected = {NegativeSite(protein, 0), NegativeSite(protein, 24)}
    assert negative_sites == expected
def enrichment_of_ptm_genes(
    reference_set, site_type_name: str, only_mutated_sites=False
):
    """Compute enrichment of genes having sites of given type.

    Only sites of the preferred isoforms are considered.

    Args:
        reference_set: passed to `genes_enrichment` as the second argument
        site_type_name: name of the site type
        only_mutated_sites:
            whether only genes with mutated sites should be considered,
            True, False or an SQLAlchemy filter,
            e.g. Mutation.in_sources(MC3Mutation)

    Returns:
        result of `genes_enrichment` for the observed genes
    """
    site_type = SiteType.query.filter_by(name=site_type_name).one()

    genes_query = Gene.query.join(Gene.preferred_isoform).join(Protein.sites)
    genes_query = genes_query.filter(SiteType.fuzzy_filter(site_type))

    if only_mutated_sites is not False:
        genes_query = genes_query.join(Site.mutations)
        genes_query = genes_query.filter(only_mutated_sites)

    return genes_enrichment(set(genes_query), reference_set)
def test_in_disordered(self):
    """in_disordered_region works both on Python and on SQL side."""
    methylation = SiteType(name='methylation')
    protein = Protein(
        refseq='NM_007', id=1,
        disorder_map='10000000001000000000000001',
        sequence='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    )
    db.session.add(protein)

    # zero-based positions
    sites_in_disordered = {0, 10, 25}
    sites_not_disordered = {1, 9, 11, 24}

    sites = {}
    for position in sites_in_disordered | sites_not_disordered:
        print(position)
        print(len(protein.sequence[position]))
        # sites use one-based positions
        sites[position] = Site(
            position=position + 1,
            types={methylation},
            residue=protein.sequence[position],
            protein=protein
        )

    # Python side
    for position in sites_in_disordered:
        assert sites[position].in_disordered_region

    for position in sites_not_disordered:
        assert not sites[position].in_disordered_region

    # SQL side
    disordered_on_sql_side = {
        site.position - 1
        for site in Site.query.filter_by(in_disordered_region=True)
    }
    assert disordered_on_sql_side == sites_in_disordered

    ordered_on_sql_side = {
        site.position - 1
        for site in Site.query.filter_by(in_disordered_region=False)
    }
    assert ordered_on_sql_side == sites_not_disordered
def are_glycosylation_sites_enriched(
    source_name: str, population_name: str,
    disordered=None, alternative='greater'
):
    """Run Fisher's exact test on mutated glycosylation vs other sites in two sources.

    Args:
        source_name: name of the first source (first row of the table)
        population_name: name of the second source (second row)
        disordered: passed to `count_mutated_sites`
        alternative: passed to `fisher_exact`

    Returns:
        (oddsratio, pvalue) tuple
    """
    from stats.table import count_mutated_sites

    glycosylation = SiteType.query.filter_by(name='glycosylation').one()
    glyco_filter = SiteType.fuzzy_filter(glycosylation, join=True)

    # ids of all the types with 'glycosylation' in the name
    glycosylation_types = SiteType.query.filter(
        SiteType.name.contains('glycosylation')
    ).all()
    glycosylation_ids = [site_type.id for site_type in glycosylation_types]
    non_glyco_filter = Site.types.any(~SiteType.id.in_(glycosylation_ids))

    source = source_manager.source_by_name[source_name]
    population = source_manager.source_by_name[population_name]

    shared_options = {'only_primary': True, 'disordered': disordered}
    count_glyc = partial(
        count_mutated_sites, custom_filter=glyco_filter, **shared_options
    )
    count_not_glyc = partial(
        count_mutated_sites, custom_filter=non_glyco_filter, **shared_options
    )

    #         mutated glyc | mutated not glyc
    # cancer |
    # popula |
    contingency_table = [
        [counter(model=model) for counter in (count_glyc, count_not_glyc)]
        for model in (source, population)
    ]
    print(contingency_table)

    oddsratio, pvalue = fisher_exact(
        contingency_table, alternative=alternative
    )
    print(source_name, population_name, oddsratio, pvalue)

    return oddsratio, pvalue
def test_consistency(self):
    """Site residue and position are validated against the protein sequence."""
    methylation = SiteType(name='methylation')
    protein = Protein(refseq='NM_007', id=1, sequence='ABCD')
    db.session.add(protein)

    # matching residue (note: for sites, positions are 1-based)
    matching_site = Site(
        position=2, types={methylation}, residue='B', protein=protein
    )
    assert matching_site

    # mismatched residue
    with pytest.raises(ValidationError):
        Site(position=3, types={methylation}, residue='B', protein=protein)

    # no residue and position in range
    assert Site(position=2, protein=protein)

    # no residue and position outside of range
    for position_out_of_range in (5, -5):
        with pytest.raises(ValidationError):
            Site(position=position_out_of_range, protein=protein)
def show(self, refseq):
    """Render the protein page ('protein/show.html') for given refseq.

    The template gets the protein, the filter manager, the widgets,
    the site types ('multi_ptm' followed by the available types)
    and the mutation types.
    """
    protein, filter_manager = self.get_protein_and_manager(refseq)

    user_datasets = current_user.datasets_names_by_uri()

    widgets = create_widgets(
        protein,
        filter_manager.filters,
        custom_datasets_names=user_datasets.values()
    )
    site_types = ['multi_ptm'] + SiteType.available_types()

    return template(
        'protein/show.html',
        protein=protein,
        filters=filter_manager,
        widgets=widgets,
        site_types=site_types,
        mutation_types=Mutation.types,
    )
def common_filters(protein, default_source='MC3', source_nullable=False,
                   custom_datasets_ids=()):
    """Create the list of filters shared by the protein-related views.

    Args:
        protein: passed to `source_dependent_filters`
        default_source: default value of the mutation sources filter
        source_nullable: whether the mutation sources filter is nullable
        custom_datasets_ids: iterable with choices for the user datasets
            filter; it is copied into a new list, so the default is an
            immutable empty tuple rather than a shared mutable list

    Returns:
        list of Filter objects: the common filters followed by the result
        of `source_dependent_filters(protein)`
    """
    return [
        Filter(
            Mutation, 'sources', comparators=['in'],
            choices=list(source_manager.visible_fields.keys()),
            default=default_source, nullable=source_nullable,
            as_sqlalchemy=sqlalchemy_filter_from_source_name
        ),
        Filter(
            UserMutations, 'sources', comparators=['in'],
            choices=list(custom_datasets_ids),
            default=None, nullable=True
        ),
        Filter(
            Mutation, 'is_ptm', comparators=['eq']
        ),
        Filter(
            Drug, 'groups.name', comparators=['in'],
            nullable=False,
            choices=cached_queries.drug_groups,
            default=['approved'],
            multiple='all',
            as_sqlalchemy=True
        ),
        Filter(
            Site, 'types', comparators=['in'],
            choices={
                site_type.name: site_type
                for site_type in SiteType.available_types()
            },
            custom_comparators={'in': SiteType.fuzzy_comparator},
            as_sqlalchemy=SiteType.fuzzy_filter,
            as_sqlalchemy_joins=[Site.types]
        )
    ] + source_dependent_filters(protein)
def test_sequence(self):
    """Site.sequence gives the padded sequence around the site (Python and SQL)."""
    methylation = SiteType(name='methylation')
    protein = Protein(
        refseq='NM_007', id=1, sequence='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    )
    db.session.add(protein)

    # zero-based position -> expected sequence around the site
    data = {
        0: '-------ABCDEFGH',
        10: 'DEFGHIJKLMNOPQR',
        25: 'STUVWXYZ-------'
    }

    # sites use one-based positions
    sites = {
        position: Site(
            position=position + 1,
            types={methylation},
            residue=protein.sequence[position],
            protein=protein
        )
        for position in data
    }

    db.session.add_all(sites.values())
    db.session.commit()

    for position, expected_sequence in data.items():
        site = sites[position]
        # Python side
        assert site.sequence == expected_sequence
        # SQL side
        found = Site.query.filter_by(sequence=expected_sequence).one()
        assert found == site

    sequences_query = (
        db.session.query(Site.sequence)
        .select_from(Site)
        .join(Protein)
    )
    sequences = {sequence for (sequence, ) in sequences_query}

    assert sequences == set(data.values())
def test_edge_cases(self):
    """Two sites are loaded by the EdgeSitesCase importer, at positions 3 and 70."""
    sequence = (
        'MASKGLQDLKQQVEGTAQEAVSAAGAAAQQVVDQATEAGQKAMDQLAKTTQETIDKTANQASDTFSGIGKKFGLLK*'
    )
    protein = Protein(refseq='NM_006829', sequence=sequence)
    db.session.add(protein)

    with initialized_importer(EdgeSitesCase, 'My_test') as importer:
        # register the test dataset together with its site type
        importer.site_datasets['My_test'] = 'mixed_type'
        importer.site_types_map['mixed_type'] = SiteType(name='mixed_type')

        sites = importer.load_sites(site_datasets=['My_test'])

    assert len(sites) == 2

    sites_by_pos = {}
    for site in sites:
        sites_by_pos[site.position] = site

    assert sites_by_pos[3].residue == 'S'
    assert sites_by_pos[70].residue == 'K'
def get_site_filters(exclude=None, glycosylation='together'):
    """Yields (site type name, type filter) tuples.

    With 'together' the glycosylation subtypes are skipped, the generic
    type is named 'glycosylation (all subtypes)' and fuzzy filters are
    used; in the other modes the filters check for the exact site type.
    With 'only' the non-glycosylation types are skipped and the generic
    type is named 'glycosylation (unknown subtype)'.

    Args:
        exclude: site types to exclude
        glycosylation: 'together', 'only' or 'separate'
    """
    together = glycosylation == 'together'

    for site_type in tqdm(SiteType.query, total=SiteType.query.count()):
        name = site_type.name
        is_glyco = 'glycosylation' in name
        is_generic = name == 'glycosylation'
        label = name

        if together:
            if is_glyco and not is_generic:
                continue
            if is_generic:
                label = 'glycosylation (all subtypes)'
        elif glycosylation == 'only':
            if not is_glyco:
                continue
            if is_generic:
                label = 'glycosylation (unknown subtype)'

        # the exclusion is checked against the original name of the type
        if exclude and name in exclude:
            continue

        if together:
            site_filter = SiteType.fuzzy_filter(site_type, join=True)
        else:
            site_filter = Site.types.contains(site_type)

        yield label, site_filter
def train_model(site_type: SiteType, sequences_dir='.tmp', sampling_n=10000,
                enzyme_type='kinase', output_path=None, **kwargs):
    """Train MIMP model for given site type.

    NOTE: Natively MIMP works on phosphorylation sites only,
    so a special, forked version [reimandlab/rmimp] is needed
    for this function to work at all.

    Args:
        site_type: Type of the site for which the model is to be trained
        sequences_dir: path to dir where sequences for trainModel should be
            dumped; its 'positive' and 'negative' subdirectories are removed
            and re-created
        sampling_n: number of sampling iterations for negative sequence set
        enzyme_type: 'kinase' to train on each kinase which is involved in
            the site type and has sites of this type, or 'catch-all' to
            train on a single pseudo-enzyme having all sites of the type
        output_path: path to .mimp file where the model should be saved
            (defaults to '<site type name>.mimp')
        **kwargs: will be passed to trainModel

    Returns:
        trained MIMP model for all kinases affecting sites of given SiteType

    Raises:
        ValueError: if `enzyme_type` is not one of the supported values
    """
    # validate before anything is loaded or removed from the disk
    if enzyme_type not in {'kinase', 'catch-all'}:
        raise ValueError(
            f'Unknown enzyme_type: {enzyme_type!r}, '
            f"expected 'kinase' or 'catch-all'"
        )

    if not output_path:
        output_path = f'{site_type.name}.mimp'

    mimp = load_mimp()

    sites_of_this_type = set(site_type.sites)
    modified_residues = site_type.find_modified_residues()

    negative_sites = gather_negative_sites(
        modified_residues, exclude=sites_of_this_type
    )

    sequences_path = Path(sequences_dir)

    positive_path = sequences_path / 'positive'
    negative_path = sequences_path / 'negative'

    # start from empty directories
    for path in [positive_path, negative_path]:
        shutil.rmtree(str(path), ignore_errors=True)
        path.mkdir(parents=True)

    if enzyme_type == 'kinase':
        enzymes = Kinase.query.filter(
            Kinase.is_involved_in.any(SiteType.name == site_type.name)
        ).filter(
            Kinase.sites.any(Site.types.contains(site_type))
        )
        enzymes = tqdm(enzymes, total=enzymes.count())
    else:
        # 'catch-all': a single object exposing `sites` and `name`
        enzymes = [
            SimpleNamespace(
                sites=Site.query.filter(Site.types.contains(site_type)),
                name=f'all_enzymes_for_{site_type.name}'
            )
        ]

    for enzyme in enzymes:
        sites = [site for site in enzyme.sites if site_type in site.types]

        positive_sequences = [site.sequence for site in sites]
        negative_sequences = sample_random_negative_sequences(
            negative_sites, sampling_n
        )

        save_kinase_sequences(enzyme, positive_sequences, positive_path)
        save_kinase_sequences(enzyme, negative_sequences, negative_path)

    priors = mimp.PRIORS.rx2('human')

    # just in case
    # r.debug(mimp.trainModel)

    return mimp.trainModel(
        str(positive_path), str(negative_path),
        file=output_path,
        priors=priors,
        # or calculate_background_frequency(),
        # both give the same values (within rounding error), the custom
        # func might come in handy in future
        residues_groups=residues_groups(site_type, modified_residues),
        **kwargs
    )
def test_counting(self):
    """Count and gather mutations and sites for canonical and non-canonical motifs."""
    x_motifs = {
        'canonical': '.{6}[^X]X[^X].{6}',
        'non-canonical': 'XXY'
    }
    # xation happens whenever there is X which is not preceded with
    # or followed by another X
    motifs_db = {'xation': x_motifs}

    protein = Protein(
        refseq='NM_007', id=1, sequence='_X_X_______X________XXY'
    )

    # the mutations are bound to the protein when created
    mutations = [
        # proximal, breaking
        Mutation(protein=protein, position=1, alt='X'),
        # proximal, non-breaking
        Mutation(protein=protein, position=1, alt='o'),
        # direct, breaking
        Mutation(protein=protein, position=2, alt='Y'),
        # proximal for two sites, breaking
        Mutation(protein=protein, position=3, alt='X'),
    ]

    xation = SiteType(name='xation')

    canonical_sites = [
        # canonical, seriously mutated and broken
        Site(protein=protein, position=2, types={xation}),
        # canonical, mutated
        Site(protein=protein, position=4, types={xation}),
        # canonical, not mutated
        Site(protein=protein, position=12, types={xation}),
    ]
    other_sites = [
        # non-canonical motif, not mutated
        Site(protein=protein, position=22, types={xation}),
    ]
    all_sites = canonical_sites + other_sites

    db.session.add(protein)
    db.session.commit()

    counter = MotifsCounter(xation, motifs_db=motifs_db)

    # counts
    counts = counter.count_muts_and_sites(Mutation.query, Site.query)

    assert counts.muts_around_sites_with_motif['canonical'] == 4
    assert counts.muts_breaking_sites_motif['canonical'] == 3
    assert counts.sites_with_broken_motif['canonical'] == 2
    assert counts.sites_with_motif['canonical'] == len(canonical_sites)

    assert counts.sites_with_broken_motif['non-canonical'] == 0
    assert counts.muts_around_sites_with_motif['non-canonical'] == 0

    # selection of sites works for a query and for a list alike
    selection = select_sites_with_motifs(
        Site.query, motifs_db['xation']
    )

    assert selection['canonical'] == set(canonical_sites)
    assert select_sites_with_motifs(
        all_sites, motifs_db['xation']
    ) == selection

    # the objects behind the counts
    data = counter.gather_muts_and_sites(Mutation.query, Site.query)

    broken_sites = {canonical_sites[0], canonical_sites[1]}
    assert data.sites_with_broken_motif['canonical'] == broken_sites
    assert data.sites_with_motif['canonical'] == set(canonical_sites)
comments.append( f'{breaking_muts} mutations breaking this motif ' f'({muts_percentage:.2f}% of PTM muts close to that motif).' f'<br>' f'{broken_sites} sites with broken motif ({sites_percentage:.2f}% of sites with this motif).' if broken_sites else None) data[motif] = genes_ordered, y, comments return data sources_combinations = [[InheritedMutation], [InheritedMutation, MC3Mutation], [MC3Mutation]] motifs_cases = cases(site_type=[SiteType(name='glycosylation')], sources=sources_combinations, count_method=['occurrences', 'distinct']).set_mode('product') def calc_motifs(sources, site_type, count_method, y_axis: str): kwargs = {} if count_method == 'occurrences': kwargs['occurrences_in'] = sources if len(sources) > 1: kwargs['intersection'] = sources counts_by_gene = count_by_sources(sources, site_type, by_genes=True, **kwargs)
def test_mutation(self):
    """Query the chromosome mutation endpoint for a novel and a known mutation."""
    site = Site(position=13, types={SiteType(name='methylation')})
    protein = Protein(
        refseq='NM_007', id=1, sites=[site],
        sequence='A' * 15, gene=Gene(name='SomeGene')
    )
    db.session.add(protein)

    from database import bdb

    # amino acid position -> genomic position
    genomic_positions = {13: 14370, 15: 14376}
    muts = {}

    for aa_pos, dna_pos in genomic_positions.items():
        mutation = Mutation(protein=protein, position=aa_pos, alt='V')
        muts[aa_pos] = mutation
        bdb.add_genomic_mut('20', dna_pos, 'G', 'A', mutation, is_ptm=True)

    query_url = '/chromosome/mutation/{chrom}/{pos}/{ref}/{alt}'

    # query as a novel mutation
    novel_mutation_query = query_url.format(
        chrom='chr20', pos=14370, ref='G', alt='A'
    )
    response = self.client.get(novel_mutation_query)

    assert response.status_code == 200

    expected_site = {
        'kinases': [],
        'position': 13,
        'residue': 'A',
        'kinase_groups': [],
        'type': 'methylation'
    }
    expected_mutation = {
        'alt': 'V',
        'gene': 'SomeGene',
        'in_datasets': {},
        'pos': 13,
        'ptm_impact': 'direct',
        'cnt_ptm': 1,
        'closest_sites': ['13 A'],
        'protein': 'NM_007',
        'sites': [expected_site],
        'ref': 'A'
    }
    assert response.json == [expected_mutation]

    # well let's look on a known mutation:
    known_mutation = muts[15]

    mc3 = MC3Mutation(
        mutation=known_mutation,
        cancer=Cancer(name='Breast invasive carcinoma', code='BRCA'),
        count=1
    )
    esp = ExomeSequencingMutation(
        mutation=known_mutation, maf_all=0.02, maf_aa=0.02
    )

    db.session.add_all([known_mutation, mc3, esp])
    db.session.commit()

    mutation_a15v_query = query_url.format(
        chrom='chr20', pos=14376, ref='G', alt='A'
    )
    response = self.client.get(mutation_a15v_query)

    metadata = {
        'MC3': {
            'Cancers': [
                {'Cancer': 'Breast invasive carcinoma', 'Value': 1}
            ]
        },
        'ESP6500': {'MAF': 0.02, 'MAF AA': 0.02, 'MAF EA': None}
    }
    assert response.json[0]['in_datasets'] == metadata

    expected_values = {'MC3': 1, 'ESP6500': 0.02}

    # if user does not want to download data for all datasets he may use:
    for source, meta in metadata.items():
        response = self.client.get(
            mutation_a15v_query + '?filters=Mutation.sources:in:' + source
        )
        record = response.json[0]
        assert record['in_datasets'] == {source: meta}
        assert record['value'] == expected_values[source]

    # source-specific filters: (query string, are any results expected?)
    source_specific_cases = [
        (
            '?filters=Mutation.sources:in:MC3;Mutation.mc3_cancer_code:in:BRCA',
            True
        ),
        (
            '?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:African American',
            True
        ),
        (
            '?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:European American',
            False
        ),
    ]
    for query_string, results_expected in source_specific_cases:
        response = self.client.get(mutation_a15v_query + query_string)
        assert bool(response.json) == results_expected
def create_network():
    """Populate the database with a test protein, its interactors and a drug.

    The test protein gets two phosphorylation sites and a mutation with
    MC3 and MIMP metadata; 'Kinase Y' is attached to the first site and is
    targeted by an approved drug, 'Kinase Z' is referenced only by name
    in one of the MIMP records.
    """
    protein = create_test_protein()
    cancer = Cancer(name='Ovarian', code='OV')

    known_interactor_of_x = create_test_kinase('Kinase Y', 'NM_0009')
    known_interactor_of_x.protein.mutations = [
        Mutation(
            position=1,
            alt='T',
            meta_MC3=[MC3Mutation(cancer=cancer)]
        )
    ]

    drug = Drug(
        name='Drug targeting ' + known_interactor_of_x.name,
        drug_bank_id='DB01',
        target_genes=[known_interactor_of_x.protein.gene],
        # by default only approved drugs are shown
        groups={DrugGroup(name='approved')}
    )

    group = KinaseGroup(name='Group of kinases')
    phosphorylation = SiteType(name='phosphorylation')

    site_with_kinase = Site(
        position=1,
        types={phosphorylation},
        residue='T',
        kinases={known_interactor_of_x},
        kinase_groups={group}
    )
    site_with_group_only = Site(
        position=2,
        types={phosphorylation},
        residue='R',
        kinase_groups={group}
    )
    protein.sites = [site_with_kinase, site_with_group_only]

    predicted_interactor = create_test_kinase('Kinase Z', 'NM_0002')

    # both MIMP records differ only in the PWM name and the effect
    shared_mimp_data = {
        'site': site_with_kinase,
        'probability': 0.1,
        'position_in_motif': 1
    }
    protein.mutations = [
        Mutation(
            position=2,
            alt='T',
            meta_MC3=[MC3Mutation(cancer=cancer)],
            meta_MIMP=[
                MIMPMutation(
                    pwm=known_interactor_of_x.name,
                    effect='loss',
                    **shared_mimp_data
                ),
                MIMPMutation(
                    pwm=predicted_interactor.name,
                    effect='gain',
                    **shared_mimp_data
                )
            ]
        )
    ]

    db.session.add_all([protein, drug, predicted_interactor])
    db.session.commit()

    # a new cancer was added, reload is necessary
    # (this should not happen during normal app usage)
    from website.views.filters import cached_queries
    cached_queries.reload()
def ptm_on_random(source=MC3Mutation, site_type='glycosylation',
                  same_proteins=False, only_preferred=True,
                  mode='occurrences', repeats=10000, ptm_proteins=False,
                  same_ptm_proteins=False, exclude_genes=None,
                  mutation_filter=None, sample_ptm_muts=True):
    """Compare frequencies of PTM mutations of given type with random proteome
    mutations from protein sequence regions of the same size as analysed
    PTM regions.

    Args:
        source: mutation source model
        site_type: name of the site type
        same_proteins: sample random mutations only from the proteins having
            sites of the type; cannot be combined with `ptm_proteins`
        only_preferred: restrict to preferred isoforms
        mode: 'occurrences' (sum the `count` of source records)
            or 'distinct' (count each record once)
        repeats: number of sampling repeats
        ptm_proteins: sample random mutations only from proteins having sites
        same_ptm_proteins: additionally require the sites to match the type;
            requires `ptm_proteins`
        exclude_genes: names of genes to be excluded
        mutation_filter: optional additional filter of mutations
        sample_ptm_muts: whether the PTM mutations should be sampled as well
            (otherwise a single, exact count is used)

    Returns:
        (ptm_counts, counts, glyco_sequence_region_size, p_value) tuple
    """
    from numpy import sum
    from numpy import zeros
    from numpy.random import choice

    assert mode in {'distinct', 'occurrences'}
    distinct = mode == 'distinct'

    assert not (same_proteins and ptm_proteins)
    assert not (same_ptm_proteins and not ptm_proteins)

    def measure(x):
        """See https://github.com/taschini/pyinterval/issues/2"""
        # NOTE(review): `sum` is numpy's here and it gets a generator,
        # which numpy documents as deprecated - left as is, to be confirmed
        return int(fpu.up(lambda: sum((c.sup - c.inf for c in x), 0)))

    site_type = SiteType.query.filter_by(name=site_type).one()
    only_preferred = Protein.is_preferred_isoform if only_preferred else True

    # all muts
    all_muts = defaultdict(lambda: defaultdict(int))

    q = (
        db.session.query(source, Mutation)
        .select_from(source)
        .join(Mutation)
        .filter(mutation_filter if mutation_filter is not None else True)
        .join(Protein)
        .filter(only_preferred)
    )

    if distinct:
        for mutation_details, mutation in tqdm(q, total=q.count()):
            all_muts[mutation.protein][mutation.position] += 1
    else:
        for mutation_details, mutation in tqdm(q, total=q.count()):
            if mutation.position > mutation.protein.length:
                print(f'Faulty mutation: {mutation}')
                continue
            all_muts[mutation.protein][mutation.position] += mutation_details.count

    # region size
    glyco_sequence_region_size = 0

    intervals_by_protein = defaultdict(interval)

    sites = (
        Site.query
        .filter(SiteType.fuzzy_filter(site_type))
        .join(Protein)
        .filter(only_preferred)
    )

    # union of the regions spanning 7 positions to each side of the sites
    for site in tqdm(sites, total=sites.count()):
        intervals_by_protein[site.protein] |= interval[
            max(site.position - 7, 0),
            min(site.position + 7, site.protein.length)
        ]

    for sequence_interval in intervals_by_protein.values():
        glyco_sequence_region_size += measure(sequence_interval)

    # ptm muts
    ptm_muts = (
        db.session.query(source, Mutation)
        .select_from(source)
        .join(Mutation)
        .join(Mutation.affected_sites)
        .filter(mutation_filter if mutation_filter is not None else True)
        .filter(SiteType.fuzzy_filter(site_type))
        .join(Protein)
        .filter(only_preferred)
    )

    if exclude_genes:
        exclude_proteins = (
            Protein.query
            .select_from(Gene)
            .join(Gene.isoforms)
            .filter(Gene.name.in_(exclude_genes))
            .all()
        )
        ptm_muts = ptm_muts.filter(
            ~Protein.id.in_([p.id for p in exclude_proteins])
        )

    ptm_muts = ptm_muts.group_by(source)

    if sample_ptm_muts:
        ptm_muts_by_protein = defaultdict(list)

        for mutation_details, mutation in ptm_muts:
            ptm_muts_by_protein[mutation.protein].append(
                (mutation_details, mutation)
            )

        ptm_mutations_array = zeros(glyco_sequence_region_size)

        pos = 0
        for protein, protein_interval in tqdm(
            intervals_by_protein.items(), total=len(intervals_by_protein)
        ):
            for mutation_details, mutation in ptm_muts_by_protein[protein]:
                p = 0
                for subinterval in protein_interval.components:
                    if mutation.position in subinterval:
                        # position in interval
                        p += mutation.position - 1 - int(subinterval[0].inf)
                        break
                    p += measure(subinterval)

                if distinct:
                    ptm_mutations_array[pos + p] += 1
                else:
                    ptm_mutations_array[pos + p] += mutation_details.count

            pos += measure(protein_interval)

        ptm_counts = []
        for repeat in tqdm(range(repeats), total=repeats):
            ptm_counts.append(
                sum(choice(ptm_mutations_array, size=glyco_sequence_region_size))
            )
        print(Series(ptm_counts).describe())
        ptm_muts_count = mean(ptm_counts)
    else:
        if distinct:
            ptm_muts_count = ptm_muts.count()
        else:
            ptm_muts_count = 0
            # the occurrences are stored on the source record (first member
            # of the row), as in the other loops of this function
            for mutation_details, mutation in ptm_muts:
                assert mutation_details.count != 0
                ptm_muts_count += mutation_details.count
            assert ptm_muts_count >= ptm_muts.count()
        ptm_counts = [ptm_muts_count]

    ptm_ratio = ptm_muts_count / glyco_sequence_region_size

    if same_proteins:
        proteins = list(intervals_by_protein.keys())
    else:
        proteins = Protein.query.filter(only_preferred)
        if ptm_proteins:
            proteins = proteins.join(Protein.sites)
            if same_ptm_proteins:
                proteins = proteins.filter(SiteType.fuzzy_filter(site_type))
        proteins = proteins.all()

    proteins = [
        protein
        for protein in proteins
        if not exclude_genes or protein.gene.name not in exclude_genes
    ]

    weights = [p.length for p in proteins]

    # the sequences of all proteins laid out one after another
    mutations_array = zeros(sum(weights))

    pos = 0
    for protein in tqdm(proteins):
        for position, count in all_muts[protein].items():
            try:
                mutations_array[pos + position - 1] = count
            except Exception:
                print(protein, pos, position)
                raise
        pos += protein.length

    counts = []
    append = counts.append
    for repeat in tqdm(range(repeats), total=repeats):
        append(sum(choice(mutations_array, size=glyco_sequence_region_size)))

    # a list is counted instead of passing a generator to numpy's `sum`
    p_value = len(
        [count for count in counts if count > ptm_muts_count]
    ) / repeats

    count_of_sampled_muts = mean(counts)
    random_ratio = count_of_sampled_muts / glyco_sequence_region_size

    explanation = '(only the same proteins)' if same_proteins else ''

    print(
        f'count of {site_type.name} mutations: {ptm_muts_count},\n'
        f'count of random mutations from protein sequence regions of the same size: {count_of_sampled_muts}'
        f' {explanation}.')
    print(f'region size: {glyco_sequence_region_size}; source: {source}')
    print(f'frequency of {site_type.name} mutations: {ptm_ratio * 100}%,\n'
          f'frequency of random mutations: {random_ratio * 100}%.')
    print(f'p-value = {p_value} (permutation test, {repeats} repeats)')
    print('Permutation test values:')
    print(Series(counts).describe())

    return ptm_counts, counts, glyco_sequence_region_size, p_value