def site_specific_network_of_kinases_and_targets(f):
    """Write a tab-separated kinase/target-site table to ``f``.

    A header line is written first. Afterwards one line is emitted for
    every (protein, site, kinase) combination reached by walking
    ``Protein.query`` -> ``protein.sites`` -> ``site.kinases``.

    The 'kinase refseq' column is left empty when ``kinase.protein`` is
    falsy.

    Args:
        f: writable text file-like object (only ``write`` is used).
    """
    columns = [
        'kinase symbol',
        'target symbol',
        'kinase refseq',
        'target refseq',
        'target sequence position',
        'target amino acid',
    ]
    f.write('\t'.join(columns) + '\n')

    # `total` only feeds the progress bar.
    progress = tqdm(Protein.query, total=fast_count(Protein.query))

    for target in progress:
        for site in target.sites:
            for kinase in site.kinases:
                if kinase.protein:
                    kinase_refseq = kinase.protein.refseq
                else:
                    kinase_refseq = ''

                row = (
                    kinase.name,
                    target.gene.name,
                    kinase_refseq,
                    target.refseq,
                    site.position,
                    site.residue,
                )
                f.write('\t'.join(str(value) for value in row) + '\n')
def mutations_affecting_ptm_sites(f, sources):
    """Write a tab-separated list of mutations together with the PTM sites they affect.

    A header line is written first. Then, for every object yielded from
    ``source.query`` (for each ``source`` in ``sources``) whose
    ``.mutation.is_ptm()`` is truthy, one line is written per site
    returned by ``mutation.get_affected_ptm_sites()``.

    Args:
        f: writable text file-like object (only ``write`` is used).
        sources: iterable of objects exposing a ``query`` attribute; the
            queried objects must provide ``mutation`` and ``summary()``.
            An empty iterable results in a header-only output.
    """
    header = [
        'gene',
        'refseq',
        'mutation position',
        'mutation alt',
        'mutation summary',
        'site position',
        'site residue',
    ]
    f.write('\t'.join(header) + '\n')

    for source in sources:
        # `total` only feeds the progress bar.
        records = tqdm(
            yield_objects(source.query),
            total=fast_count(source.query)
        )
        for mut_details in records:
            mutation = mut_details.mutation

            if not mutation.is_ptm():
                continue

            # Everything below is identical for all sites of this mutation,
            # so it is computed once instead of once per site.
            # NOTE(review): summary() is now called even if no site is
            # yielded for a PTM mutation - presumably that never happens;
            # confirm against is_ptm()/get_affected_ptm_sites().
            protein = mutation.protein
            summary = mut_details.summary()
            if isinstance(summary, list):
                summary = ', '.join(summary)

            shared_columns = [
                protein.gene.name,
                protein.refseq,
                mutation.position,
                mutation.alt,
                summary,
            ]

            for site in mutation.get_affected_ptm_sites():
                row = shared_columns + [site.position, site.residue]
                f.write('\t'.join(map(str, row)) + '\n')
def export(self, path=None, only_primary_isoforms=False):
    """Dump every mutation of this source in ActiveDriver compatible format.

    The output is a gzip-compressed, tab-separated text file. Five fixed
    columns (gene, isoform, position, wt_residue, mut_residue) are
    followed by source-specific columns: their names come from
    ``export_details_headers`` and their values from ``export_details``,
    which may yield several rows per mutation.

    Args:
        path: destination file; when falsy, the path is obtained from
            ``self.generate_export_path(only_primary_isoforms)``.
        only_primary_isoforms: when true, mutations whose protein is not
            the preferred isoform are skipped.
    """
    import gc
    from tqdm import tqdm

    if not path:
        path = self.generate_export_path(only_primary_isoforms)

    columns = [
        'gene', 'isoform', 'position', 'wt_residue', 'mut_residue'
    ] + self.export_details_headers()

    with gzip.open(path, 'wt') as f:
        # No newline after the header: each row is written with a leading
        # '\n', so the file does not end with a newline.
        f.write('\t'.join(columns))

        records = tqdm(
            yield_objects(self.model.query),
            total=fast_count(db.session.query(self.model))
        )

        # `tick` counts every record, including the skipped ones.
        for tick, record in enumerate(records, start=1):
            m = record.mutation

            if only_primary_isoforms and not m.protein.is_preferred_isoform:
                continue

            dataset_specific = self.export_details(record)

            try:
                ref = m.ref
            except IndexError:
                print(
                    'Mutation: %s %s %s is exceeding the proteins sequence'
                    % (m.protein.refseq, m.position, m.alt)
                )
                ref = ''

            for instance in dataset_specific:
                f.write('\n' + '\t'.join([
                    m.protein.gene.name,
                    m.protein.refseq,
                    str(m.position),
                    ref,
                    m.alt
                ] + instance))

            # Drop the reference before the periodic collection below.
            del record

            if tick % 10000 == 0:
                gc.collect()
def interactions(self):
    """Return the combined result of counting two joined queries.

    The first query joins ``Site`` with ``Kinase`` through
    ``Site.kinases``; the second joins ``Site`` with ``KinaseGroup``
    through ``Site.kinase_groups``. Both are passed to ``fast_count``
    and the two results are added.
    """
    site_query = db.session.query(models.Site)
    with_kinases = fast_count(
        site_query.join(models.Kinase, models.Site.kinases)
    )

    site_query = db.session.query(models.Site)
    with_kinase_groups = fast_count(
        site_query.join(models.KinaseGroup, models.Site.kinase_groups)
    )

    return with_kinases + with_kinase_groups
def kinase_groups_covered(self):
    """Return ``fast_count`` of kinase groups that have at least one site.

    The query selects ``KinaseGroup`` rows filtered by
    ``KinaseGroup.sites.any()``.
    """
    has_any_site = models.KinaseGroup.sites.any()
    covered_groups = db.session.query(models.KinaseGroup).filter(has_any_site)
    return fast_count(covered_groups)
def confirmed_mutations_count(self):
    """Return the result of ``fast_count`` applied to ``self.confirmed_mutations``."""
    confirmed = self.confirmed_mutations
    return fast_count(confirmed)
def count(self):
    """Return the result of ``fast_count`` applied to ``self.query``."""
    query = self.query
    return fast_count(query)
def export(self, path=None, only_primary_isoforms=False):
    """Export all mutations from this source in ActiveDriver compatible format.

    Source specific data export can be implemented with export_details
    method, while export_details_headers should provide names for
    respective headers.

    The output is a gzip-compressed, tab-separated text file. When no
    ``path`` is given, the file is created in ``exported/mutations``
    (the directory is created if missing) and named
    ``{model_name}{restrictions}-{date}.tsv.gz``, where ``date`` is the
    current UTC time.

    Args:
        path: destination file; generated as described above when falsy.
        only_primary_isoforms: when true, mutations whose protein is not
            the preferred isoform are skipped and
            '-primary_isoforms_only' is added to the generated file name.
    """
    from datetime import datetime, timezone
    import gc
    import os
    from tqdm import tqdm

    # datetime.utcnow() is deprecated since Python 3.12. Stripping tzinfo
    # from an aware UTC timestamp gives the same naive value, so the
    # generated file name keeps its previous format.
    export_time = datetime.now(timezone.utc).replace(tzinfo=None)

    if not path:
        directory = os.path.join('exported', 'mutations')
        os.makedirs(directory, exist_ok=True)

        name_template = '{model_name}{restrictions}-{date}.tsv.gz'
        name = name_template.format(
            model_name=self.model_name,
            restrictions=(
                '-primary_isoforms_only' if only_primary_isoforms else ''
            ),
            date=export_time
        )
        path = os.path.join(directory, name)

    header = [
        'gene', 'isoform', 'position', 'wt_residue', 'mut_residue'
    ] + self.export_details_headers()

    with gzip.open(path, 'wt') as f:
        # No newline after the header: each row is written with a leading
        # '\n', so the file does not end with a newline.
        f.write('\t'.join(header))

        records = tqdm(
            yield_objects(self.model.query),
            total=fast_count(db.session.query(self.model))
        )

        # `tick` counts every record, including the skipped ones.
        for tick, mutation in enumerate(records, start=1):
            m = mutation.mutation

            if only_primary_isoforms and not m.protein.is_preferred_isoform:
                continue

            dataset_specific = self.export_details(mutation)

            try:
                ref = m.ref
            except IndexError:
                print(
                    'Mutation: %s %s %s is exceeding the proteins sequence'
                    % (m.protein.refseq, m.position, m.alt)
                )
                ref = ''

            for instance in dataset_specific:
                data = [
                    m.protein.gene.name,
                    m.protein.refseq,
                    str(m.position),
                    ref,
                    m.alt
                ] + instance
                f.write('\n' + '\t'.join(data))

            # Drop the reference before the periodic collection below.
            del mutation

            if tick % 10000 == 0:
                gc.collect()