def create_construct_data(self, filenames=False):
    """Import experimental construct data.

    Reads construct JSON files (all files in ``self.construct_data_dir``
    unless an explicit ``filenames`` list is given) and then auto-generates
    construct entries for every structure in the database.

    Parameters
    ----------
    filenames : list or False
        Optional list of JSON file names to restrict the import to.
    """
    self.logger.info('ADDING EXPERIMENTAL CONSTRUCT DATA')

    # read source files
    if not filenames:
        filenames = os.listdir(self.construct_data_dir)
    for filename in filenames:
        # only files whose name ends in 'json' are construct sources
        if filename[-4:] != 'json':
            continue
        filepath = os.sep.join([self.construct_data_dir, filename])
        with open(filepath) as json_file:
            d = json.load(json_file)
            add_construct(d)

    # auto-generate a construct entry for every deposited structure
    structures = Structure.objects.all()
    for s in structures:
        pdbname = str(s)
        try:
            protein = Protein.objects.filter(entry_name=pdbname.lower()).get()
            d = fetch_pdb_info(pdbname, protein)
            add_construct(d)
        except Exception:
            # FIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; keep the best-effort behavior
            # per structure but log the failure as well.
            self.logger.error('construct creation failed for %s', pdbname)
            print(pdbname, 'failed')

    self.logger.info('COMPLETED CREATING EXPERIMENTAL CONSTRUCT DATA')
def create_construct_data(self, filenames=False):
    """Load construct JSON sources, then derive construct entries for
    every structure in the database (best effort, failures printed)."""
    self.logger.info('ADDING EXPERIMENTAL CONSTRUCT DATA')

    # Decide which JSON source files to process.
    source_files = filenames if filenames else os.listdir(self.construct_data_dir)
    for source_file in source_files:
        if not source_file.endswith('json'):
            continue
        path = os.sep.join([self.construct_data_dir, source_file])
        with open(path) as handle:
            add_construct(json.load(handle))

    # One auto-generated entry per structure.
    for structure in Structure.objects.all():
        code = str(structure)
        try:
            receptor = Protein.objects.filter(entry_name=code.lower()).get()
            add_construct(fetch_pdb_info(code, receptor))
        except:
            print(code, 'failed')

    self.logger.info('COMPLETED CREATING EXPERIMENTAL CONSTRUCT DATA')
def replace_deletions(self):
    """Rebuild ConstructDeletion records for every construct.

    For each construct: reset its cached diagrams, delete its stored
    deletions and re-create them from (cached) PDB annotation data.
    """
    for c in Construct.objects.all():
        pdbname = c.structure.pdb_code.index

        # reset caches so schematics/snake plots are regenerated
        c.schematics = None
        c.snakecache = None
        c.save()
        c.deletions.all().delete()

        protein = Protein.objects.filter(entry_name=pdbname.lower()).get()

        d = cache.get(pdbname + "_auto_d")
        # FIX: original guard was `if not d and 'deletions' in d:` which
        # raises TypeError on a cache miss (membership test against None)
        # and never refetched. A cache miss alone triggers the fetch.
        if not d:
            d = fetch_pdb_info(pdbname, protein)
            cache.set(pdbname + "_auto_d", d, 60 * 60 * 24)

        if 'deletions' in d:
            # use a distinct loop variable -- the original shadowed `d`
            for deletion in d['deletions']:
                ConstructDeletion.objects.get_or_create(
                    construct=c,
                    start=deletion['start'],
                    end=deletion['end'])
        else:
            print('No deletions in d[]', pdbname)
def replace_deletions(self):
    """Recreate all deletion annotations from PDB-derived data.

    Clears cached diagrams and stored deletions per construct, then
    repopulates the deletions from the (day-cached) PDB info.
    """
    for c in Construct.objects.all():
        pdbname = c.structure.pdb_code.index

        # reset cached diagrams so they are rebuilt with the new deletions
        c.schematics = None
        c.snakecache = None
        c.save()
        c.deletions.all().delete()

        protein = Protein.objects.filter(entry_name=pdbname.lower()).get()

        d = cache.get(pdbname + "_deletions")
        if not d:
            d = fetch_pdb_info(pdbname, protein)
            cache.set(pdbname + "_deletions", d, 60 * 60 * 24)

        # ROBUSTNESS FIX: the original indexed d['deletions'] directly and
        # crashed with KeyError when the fetched data lacked the key (the
        # sibling variant of this method guards for exactly that case).
        # Also avoid shadowing the fetched dict with the loop variable.
        for deletion in d.get('deletions', []):
            ConstructDeletion.objects.get_or_create(
                construct=c,
                start=deletion['start'],
                end=deletion['end'])
def check_deletions(self):
    """Compare database deletions with PDB-derived ones per construct and
    dump the mismatches to construct_del_issues.csv (tab separated)."""
    csv_rows = []
    for c in Construct.objects.all():
        pdbname = c.structure.pdb_code.index
        cname = c.name
        protein = Protein.objects.filter(entry_name=pdbname.lower()).get()
        uniprot = protein.parent.entry_name

        # Fetch (or reuse day-cached) PDB annotation data.
        info = cache.get(pdbname + "_deletions")
        if not info:
            info = fetch_pdb_info(pdbname, protein)
            cache.set(pdbname + "_deletions", info, 60 * 60 * 24)

        # Expand PDB deletion ranges into individual residue positions.
        pdb_deletions = []
        for rng in info['deletions']:
            pdb_deletions += range(rng['start'], rng['end'] + 1)

        # Same expansion for the deletions stored in the database.
        db_deletions = []
        for rec in c.deletions.all():
            db_deletions += range(rec.start, rec.end + 1)

        # Positions only on one side, re-grouped into [start, end] ranges
        # via the consecutive-run groupby idiom.
        present_in_pdb_only = set(pdb_deletions) - set(db_deletions)
        present_in_pdb_only_list = []
        for _, run in groupby(enumerate(present_in_pdb_only), lambda x: x[0] - x[1]):
            members = list(map(itemgetter(1), run))
            present_in_pdb_only_list.append([members[0], members[-1]])

        present_in_db_only = set(db_deletions) - set(pdb_deletions)
        present_in_db_only_list = []
        for _, run in groupby(enumerate(present_in_db_only), lambda x: x[0] - x[1]):
            members = list(map(itemgetter(1), run))
            present_in_db_only_list.append([members[0], members[-1]])

        if present_in_pdb_only or present_in_db_only:
            print(pdbname)
            if present_in_pdb_only:
                print("PDBONLY", present_in_pdb_only)
            if present_in_db_only:
                print("DBONLY", present_in_db_only)
            csv_rows.append([pdbname, uniprot, cname,
                             present_in_db_only_list,
                             present_in_pdb_only_list, ''])

    import csv
    with open('construct_del_issues.csv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(csv_rows)
def create_construct_local_data(self, filenames=False):
    """Purge and re-import construct data from the shared and local JSON
    directories, then auto-create entries for structures without one.

    Parameters
    ----------
    filenames : list or False
        Optional explicit list of file names for the shared directory;
        the local directory is always scanned in full.
    """
    self.logger.info('ADDING EXPERIMENTAL CONSTRUCT DATA')

    # delete existing entries before re-importing
    self.purge_construct_data()

    # read source files: shared directory first, then local overrides
    # (DECOMPOSITION: the two identical loops now share one helper)
    if not filenames:
        filenames = os.listdir(self.construct_data_dir)
    self._add_construct_files(self.construct_data_dir, filenames)
    self._add_construct_files(self.construct_data_local_dir,
                              os.listdir(self.construct_data_local_dir))

    # auto-create entries for structures that still lack construct data
    for s in Structure.objects.all():
        pdbname = str(s)
        try:
            exists = Construct.objects.filter(
                structure__pdb_code__index=pdbname).exists()
            if not exists:
                print(pdbname)
                protein = Protein.objects.filter(
                    entry_name=pdbname.lower()).get()
                add_construct(fetch_pdb_info(pdbname, protein))
            else:
                print("Entry for", pdbname, "already there")
        except Exception:
            # FIX: narrowed from a bare except so Ctrl-C still aborts;
            # per-structure failures remain best-effort.
            print(pdbname, 'failed')

def _add_construct_files(self, directory, filenames):
    """Helper: add every '*json' file from *filenames* found in *directory*."""
    for filename in sorted(filenames):
        print('dealing with', filename)
        if filename[-4:] != 'json':
            continue
        filepath = os.sep.join([directory, filename])
        with open(filepath) as json_file:
            add_construct(json.load(json_file))
def create_construct_local_data(self, filenames=False):
    """Wipe existing construct data, re-import it from both JSON
    directories, and build entries for structures missing one."""
    self.logger.info('ADDING EXPERIMENTAL CONSTRUCT DATA')

    # delete existing
    self.purge_construct_data()

    # Shared directory (or an explicit file list if one was supplied).
    json_sources = filenames if filenames else os.listdir(self.construct_data_dir)
    for name in sorted(json_sources):
        print('dealing with', name)
        if name.endswith('json'):
            path = os.sep.join([self.construct_data_dir, name])
            with open(path) as fh:
                add_construct(json.load(fh))

    # Local directory is always scanned in full.
    for name in sorted(os.listdir(self.construct_data_local_dir)):
        print('dealing with', name)
        if name.endswith('json'):
            path = os.sep.join([self.construct_data_local_dir, name])
            with open(path) as fh:
                add_construct(json.load(fh))

    # Fill in structures that still have no construct entry.
    for structure in Structure.objects.all():
        code = str(structure)
        try:
            already = Construct.objects.filter(
                structure__pdb_code__index=code).exists()
            if already:
                print("Entry for", code, "already there")
            else:
                print(code)
                receptor = Protein.objects.filter(entry_name=code.lower()).get()
                add_construct(fetch_pdb_info(code, receptor))
        except:
            print(code, 'failed')
def check_deletions(self):
    """Cross-check stored deletions against the auto-fetched PDB data
    ('_auto_d' cache) and write discrepancies to construct_del_issues.csv."""
    csv_rows = []
    for c in Construct.objects.all():
        pdbname = c.structure.pdb_code.index
        cname = c.name
        protein = Protein.objects.filter(entry_name=pdbname.lower()).get()
        uniprot = protein.parent.entry_name

        # Day-cached auto-annotation data for this PDB entry.
        info = cache.get(pdbname + "_auto_d")
        if not info:
            info = fetch_pdb_info(pdbname, protein)
            cache.set(pdbname + "_auto_d", info, 60 * 60 * 24)

        # Expand both sources into flat residue-position lists.
        pdb_deletions = []
        for rng in info['deletions']:
            pdb_deletions += range(rng['start'], rng['end'] + 1)
        db_deletions = []
        for rec in c.deletions.all():
            db_deletions += range(rec.start, rec.end + 1)

        # Collapse one-sided positions back into [start, end] runs.
        present_in_pdb_only = set(pdb_deletions) - set(db_deletions)
        present_in_pdb_only_list = []
        for _, run in groupby(enumerate(present_in_pdb_only), lambda x: x[0] - x[1]):
            members = list(map(itemgetter(1), run))
            present_in_pdb_only_list.append([members[0], members[-1]])

        present_in_db_only = set(db_deletions) - set(pdb_deletions)
        present_in_db_only_list = []
        for _, run in groupby(enumerate(present_in_db_only), lambda x: x[0] - x[1]):
            members = list(map(itemgetter(1), run))
            present_in_db_only_list.append([members[0], members[-1]])

        if present_in_pdb_only or present_in_db_only:
            print(pdbname)
            if present_in_pdb_only:
                print("PDBONLY", present_in_pdb_only)
            if present_in_db_only:
                print("DBONLY", present_in_db_only)
            csv_rows.append([pdbname, uniprot, cname,
                             present_in_db_only_list,
                             present_in_pdb_only_list, ''])

    import csv
    with open('construct_del_issues.csv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(csv_rows)
def create_construct_data(self, filenames=False):
    """Add construct data from explicit JSON files, or — when called with
    no filenames — auto-generate entries for every structure not flagged
    as refined that lacks one.

    Parameters
    ----------
    filenames : list or False
        JSON file names to import; falsy triggers the do-all path.
    """
    self.logger.info('ADDING EXPERIMENTAL CONSTRUCT DATA')

    # read source files
    do_all = False
    if not filenames:
        do_all = True

    if filenames:
        for filename in filenames:
            if filename[-4:] != 'json':
                continue
            filepath = os.sep.join([self.construct_data_dir, filename])
            print('Adding ' + filepath)
            with open(filepath) as json_file:
                d = json.load(json_file)
                add_construct(d)

    if do_all:
        # structures flagged refined=True are excluded from auto-generation
        structures = Structure.objects.all().exclude(refined=True)
        for s in structures:
            pdbname = str(s)
            try:
                exists = Construct.objects.filter(
                    structure__pdb_code__index=pdbname).exists()
                if not exists:
                    print(pdbname)
                    protein = Protein.objects.filter(
                        entry_name=pdbname.lower()).get()
                    d = fetch_pdb_info(pdbname, protein)
                    add_construct(d)
                else:
                    print("Entry for", pdbname, "already there")
            except Exception:
                # FIX: narrowed from a bare except so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                print(pdbname, 'failed')

    self.logger.info('COMPLETED CREATING EXPERIMENTAL CONSTRUCT DATA')
def create_construct_data(self, filenames=False):
    """Import the given construct JSON files; with no filenames, build an
    entry for each non-refined structure that does not have one."""
    self.logger.info('ADDING EXPERIMENTAL CONSTRUCT DATA')

    # No explicit file list means "process everything".
    process_everything = not filenames

    if filenames:
        for entry in filenames:
            if not entry.endswith('json'):
                continue
            path = os.sep.join([self.construct_data_dir, entry])
            print('Adding ' + path)
            with open(path) as fh:
                add_construct(json.load(fh))

    if process_everything:
        for structure in Structure.objects.all().exclude(refined=True):
            code = str(structure)
            try:
                if Construct.objects.filter(
                        structure__pdb_code__index=code).exists():
                    print("Entry for", code, "already there")
                else:
                    receptor = Protein.objects.filter(
                        entry_name=code.lower()).get()
                    add_construct(fetch_pdb_info(code, receptor))
            except:
                print(code, 'failed')

    self.logger.info('COMPLETED CREATING EXPERIMENTAL CONSTRUCT DATA')
def check_mutations(self):
    """Reconcile Excel-annotated mutations with stored ConstructMutation
    records (using day-cached '<pdb>_mutations' data), then export the
    still-unannotated mutations to construct_mut_missing.csv."""
    track_annotated_mutations = []
    cached_mutations = {}
    for i, mut in enumerate(self.excel_mutations):
        # Unpack the fixed Excel columns into a named dict.
        m = {}
        m['gn'] = mut[8]
        m['mut_aa'] = mut[11]
        m['wt_aa'] = mut[10]
        m['entry_name'] = mut[6]
        m['pos'] = int(mut[9])
        m['pdb'] = mut[1]
        m['thermo_effect'] = mut[12]
        m['expression_effect'] = mut[13]
        m['site_effect'] = mut[14]
        m['site_effect_type'] = mut[15]
        m['other_effect'] = mut[16]

        # A pdb column starting with '%' means "all constructs of this
        # receptor except the listed PDBs"; otherwise it is an explicit
        # comma-separated PDB list.
        if m['pdb'] and m['pdb'][0] != '%':
            pdbs = m['pdb'].split(',')
            if len(pdbs) == 0:
                cons = Construct.objects.filter(
                    structure__pdb_code__index=m['pdb'])
            else:
                cons = Construct.objects.filter(
                    structure__pdb_code__index__in=pdbs)
        else:
            cons = Construct.objects.filter(
                structure__protein_conformation__protein__parent__entry_name=m['entry_name'])

        pdbs_has = []
        pdbs_hasnot = []
        for c in cons:
            c_pdb = c.structure.pdb_code.index
            not_to_check = None
            if m['pdb'] and m['pdb'][0] == '%':
                # if there are some pdbs not to check on this uniprot
                not_to_check = m['pdb'].replace("%", "").split(",")
                if c_pdb in not_to_check:
                    continue

            protein = Protein.objects.filter(entry_name=c_pdb.lower()).get()

            # Per-run dict avoids repeated cache round-trips.
            if c_pdb in cached_mutations:
                d = cached_mutations[c_pdb]
            else:
                d = cache.get(c_pdb + "_mutations")
                if not d:
                    d = fetch_pdb_info(c_pdb, protein)
                    cache.set(c_pdb + "_mutations", d, 60 * 60 * 24)
                cached_mutations[c_pdb] = d

            # Find construct mutation
            cons_muts = ConstructMutation.objects.filter(
                construct=c,
                sequence_number=m['pos'],
                mutated_amino_acid=m['mut_aa'],
                wild_type_amino_acid=m['wt_aa'])
            if not cons_muts.exists() and m['other_effect'] != 'Non-receptor' \
                    and m['other_effect'] != 'Wrong annotation - remove!':
                # No stored record -- check whether the PDB data has it.
                found = False
                for pdb_m in d['mutations']:
                    if int(pdb_m['pos']) == m['pos']:
                        found = True
                        break
                if found:
                    # Present in the PDB data: create the missing record.
                    res_wt = Residue.objects.get(
                        protein_conformation__protein=protein.parent,
                        sequence_number=m['pos'])
                    mut = ConstructMutation.objects.create(
                        construct=c,
                        sequence_number=m['pos'],
                        wild_type_amino_acid=m['wt_aa'],
                        mutated_amino_acid=m['mut_aa'],
                        residue=res_wt)
                    pdbs_has.append(c_pdb)
                else:
                    # Not in the PDB data either.
                    pdbs_hasnot.append("%" + c_pdb)
                    cons_muts_odd = ConstructMutation.objects.filter(
                        construct=c, sequence_number=m['pos'])
                    for cons_mut in cons_muts_odd:
                        print(c_pdb, cons_mut)
            else:
                pdbs_has.append(c_pdb)
                cons_muts = ConstructMutation.objects.filter(
                    construct=c,
                    sequence_number=m['pos'],
                    mutated_amino_acid=m['mut_aa'],
                    wild_type_amino_acid=m['wt_aa'])
                for cons_mut in cons_muts:
                    if m['other_effect'] == 'Non-receptor' or \
                            m['other_effect'] == 'Wrong annotation - remove!':
                        cons_mut.delete()
                        continue
                    # Clear existing to replace with current
                    cons_mut.effects.clear()
                    if m['thermo_effect']:
                        mutation_type, created = ConstructMutationType.objects.get_or_create(
                            slug=slugify('Thermostabilising'),
                            name='Thermostabilising',
                            effect=m['thermo_effect'])
                        cons_mut.effects.add(mutation_type)
                    if m['expression_effect']:
                        mutation_type, created = ConstructMutationType.objects.get_or_create(
                            slug=slugify('Receptor Expression'),
                            name='Receptor Expression',
                            effect=m['expression_effect'])
                        cons_mut.effects.add(mutation_type)
                    if m['site_effect']:
                        mutation_type, created = ConstructMutationType.objects.get_or_create(
                            slug=slugify(m['site_effect']),
                            name=m['site_effect'],
                            effect=m['site_effect_type'])
                        cons_mut.effects.add(mutation_type)
                    if m['other_effect']:
                        mutation_type, created = ConstructMutationType.objects.get_or_create(
                            slug=slugify('Other effect'),
                            name='Other effect',
                            effect=m['other_effect'])
                        cons_mut.effects.add(mutation_type)
                    track_annotated_mutations.append(cons_mut.pk)

    print(len(track_annotated_mutations), 'annotated mutations')
    non_annotated_muts = ConstructMutation.objects.all().exclude(
        pk__in=track_annotated_mutations).order_by(
            'construct__protein__entry_name', 'sequence_number')
    print(len(non_annotated_muts), 'non-annotated mutations')

    # CSV layout mirrors the annotation spreadsheet columns.
    csv_rows = [[
        'reference', 'pdb', 'construct name', 'class', 'lig type', 'rec fam',
        'uniprot', 'segment', 'gpcrdb#', 'AA no.', 'WT aa', 'Mut aa',
        '', '', '', '', '', 'Remark'
    ]]
    for mut in non_annotated_muts:
        pdb = mut.construct.structure.pdb_code.index
        uniprot = mut.construct.protein.entry_name
        seg = mut.residue.protein_segment.slug
        if mut.residue.generic_number:
            gn = mut.residue.generic_number.label
        else:
            gn = ''
        pos = mut.sequence_number
        wt_aa = mut.wild_type_amino_acid
        mut_aa = mut.mutated_amino_acid
        annotated_effect = [e.slug for e in mut.effects.all()]
        csv_rows.append([
            '', pdb, '', '', '', '', uniprot, seg, gn, pos, wt_aa, mut_aa,
            '', '', '', '', '', ','.join(annotated_effect)
        ])

    import csv
    with open('construct_mut_missing.csv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(csv_rows)
def match_all_with_uniprot_mutations(self):
    """Validate every stored ConstructMutation against the PDB-derived
    data ('_mutations' cache) and the annotation Excel; delete records
    found in neither and export issues to construct_mut_issues.csv."""
    constructs = Construct.objects.all()
    csv_rows = [[
        'reference', 'pdb', 'construct name', 'class', 'lig type', 'rec fam',
        'uniprot', 'segment', 'gpcrdb#', 'AA no.', 'WT aa', 'Mut aa',
        '', '', '', '', '', 'Remark'
    ]]
    for c in constructs:
        issues = []
        pdbname = c.structure.pdb_code.index
        protein = Protein.objects.filter(entry_name=pdbname.lower()).get()
        uniprot = protein.parent.entry_name

        d = cache.get(pdbname + "_mutations")
        if not d:
            d = fetch_pdb_info(pdbname, protein)
            cache.set(pdbname + "_mutations", d, 60 * 60 * 24)

        # Pass 1: every stored mutation must exist in the PDB data,
        # otherwise decide via the Excel sheet whether to keep or delete.
        cons_muts = ConstructMutation.objects.filter(construct=c)
        for m in cons_muts:
            seq_pos = m.sequence_number
            found = False
            for pdb_m in d['mutations']:
                if int(pdb_m['pos']) == seq_pos \
                        and pdb_m['wt'] == m.wild_type_amino_acid \
                        and (pdb_m['mut'] == m.mutated_amino_acid):
                    found = True
                    break
            if not found:
                ignore = False
                for m_xlx in self.excel_mutations:
                    if m_xlx[6] == uniprot and int(m_xlx[9]) == seq_pos \
                            and m_xlx[11] == m.mutated_amino_acid \
                            and m_xlx[10] == m.wild_type_amino_acid:
                        found = False
                        if pdbname in m_xlx[1] or m_xlx[1] == '':
                            found = True
                        if '%' + pdbname in m_xlx[1]:
                            found = False
                        if found:
                            if m_xlx[16] != 'Non-receptor' and \
                                    m_xlx[16] != 'Wrong annotation - remove!':
                                ignore = True
                if ignore:
                    issues.append(('In excel but missing in pdb?',
                                   seq_pos, m.mutated_amino_acid))
                else:
                    # Present nowhere: drop the stored record.
                    issues.append(('Not in excel nor pdb, deleting',
                                   seq_pos, m.mutated_amino_acid))
                    m.delete()
                    print(issues)
                    continue
                issues.append(('missing in pdb?', seq_pos,
                               m.mutated_amino_acid))
                mut_aa = m.mutated_amino_acid
                pos = m.sequence_number
                wt_aa = m.wild_type_amino_acid
                annotated_effect = "Not identified in PDB -- perhaps delete?"
                res = Residue.objects.get(
                    protein_conformation__protein=protein.parent,
                    sequence_number=pos)
                seg = res.protein_segment.slug
                if res.generic_number:
                    gn = res.generic_number.label
                else:
                    gn = ''
                csv_rows.append([
                    '', pdbname, '', '', '', '', protein.parent.entry_name,
                    seg, gn, pos, wt_aa, mut_aa, '', '', '', '', '',
                    annotated_effect
                ])

        # Pass 2: every PDB-derived mutation should be stored, unless the
        # Excel sheet says to ignore it for this construct.
        for m in d['mutations']:
            cons_muts = ConstructMutation.objects.filter(
                construct=c,
                sequence_number=m['pos'],
                mutated_amino_acid=m['mut'],
                wild_type_amino_acid=m['wt'])
            if not cons_muts.exists():
                ignore = False
                for m_xlx in self.excel_mutations:
                    if m_xlx[6] == uniprot and int(m_xlx[9]) == m['pos']:
                        found = False
                        if pdbname in m_xlx[1] or m_xlx[1] == '':
                            found = True
                        if '%' + pdbname in m_xlx[1]:
                            found = False
                        if found:
                            if m_xlx[16] == 'Non-receptor' or \
                                    m_xlx[16] == 'Wrong annotation - remove!':
                                ignore = True
                if ignore:
                    continue
                issues.append((
                    '{}{}{} ({})'.format(m['wt'], m['pos'], m['mut'],
                                         m['type']),
                    ' not in db, nor to be ignored in excel'))
                mut_aa = m['mut']
                pos = m['pos']
                wt_aa = m['wt']
                annotated_effect = m['type']
                res = Residue.objects.get(
                    protein_conformation__protein=protein.parent,
                    sequence_number=pos)
                seg = res.protein_segment.slug
                if res.generic_number:
                    gn = res.generic_number.label
                else:
                    gn = ''
                csv_rows.append([
                    '', pdbname, '', '', '', '', protein.parent.entry_name,
                    seg, gn, pos, wt_aa, mut_aa, '', '', '', '', '',
                    annotated_effect
                ])

        if issues:
            print(pdbname)
            for i in issues:
                print(i)

    import csv
    with open('construct_mut_issues.csv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(csv_rows)
def check_mutations(self):
    """Variant of the mutation reconciliation that uses the '_auto_d'
    cache and additionally tracks which PDBs appear in the Excel sheet,
    reporting structures with no mutation annotation at all."""
    track_annotated_mutations = []
    cached_mutations = {}
    # Every PDB code mentioned anywhere in the Excel sheet ends up here,
    # so structures absent from the sheet can be reported afterwards.
    mut_pdb_list = {}
    for i, mut in enumerate(self.excel_mutations):
        m = {}
        m['gn'] = mut[8]
        m['mut_aa'] = mut[11]
        m['wt_aa'] = mut[10]
        m['entry_name'] = mut[6]
        m['pos'] = int(mut[9])
        m['pdb'] = mut[1]
        m['thermo_effect'] = mut[12]
        m['expression_effect'] = mut[13]
        m['site_effect'] = mut[14]
        m['site_effect_type'] = mut[15]
        m['other_effect'] = mut[16]

        if m['pdb'] and m['pdb'][0] != '%':
            pdbs = m['pdb'].split(',')
            for pdb in pdbs:
                if pdb not in mut_pdb_list:
                    mut_pdb_list[pdb] = []
            if len(pdbs) == 0:
                cons = Construct.objects.filter(
                    structure__pdb_code__index=m['pdb'])
            else:
                cons = Construct.objects.filter(
                    structure__pdb_code__index__in=pdbs)
        else:
            cons = Construct.objects.filter(
                structure__protein_conformation__protein__parent__entry_name=m['entry_name'])

        pdbs_has = []
        pdbs_hasnot = []
        for c in cons:
            c_pdb = c.structure.pdb_code.index
            if c_pdb not in mut_pdb_list:
                mut_pdb_list[c_pdb] = []
            not_to_check = None
            if m['pdb'] and m['pdb'][0] == '%':
                # if there are some pdbs not to check on this uniprot
                not_to_check = m['pdb'].replace("%", "").split(",")
                for pdb in not_to_check:
                    if pdb not in mut_pdb_list:
                        mut_pdb_list[pdb] = []
                if c_pdb in not_to_check:
                    continue

            protein = Protein.objects.filter(entry_name=c_pdb.lower()).get()

            if c_pdb in cached_mutations:
                d = cached_mutations[c_pdb]
            else:
                d = cache.get(c_pdb + "_auto_d")
                if not d:
                    d = fetch_pdb_info(c_pdb, protein)
                    cache.set(c_pdb + "_auto_d", d, 60 * 60 * 24)
                cached_mutations[c_pdb] = d

            # Find construct mutation
            cons_muts = ConstructMutation.objects.filter(
                construct=c,
                sequence_number=m['pos'],
                mutated_amino_acid=m['mut_aa'],
                wild_type_amino_acid=m['wt_aa'])
            if not cons_muts.exists() and m['other_effect'] != 'Non-receptor' \
                    and m['other_effect'] != 'Wrong annotation - remove!':
                # If no hits something is odd -- consult the PDB data.
                found = False
                for pdb_m in d['mutations']:
                    if int(pdb_m['pos']) == m['pos']:
                        found = True
                        break
                if found:
                    # Found in the PDB data: add the missing record.
                    res_wt = Residue.objects.get(
                        protein_conformation__protein=protein.parent,
                        sequence_number=m['pos'])
                    mut = ConstructMutation.objects.create(
                        construct=c,
                        sequence_number=m['pos'],
                        wild_type_amino_acid=m['wt_aa'],
                        mutated_amino_acid=m['mut_aa'],
                        residue=res_wt)
                    pdbs_has.append(c_pdb)
                else:
                    pdbs_hasnot.append("%" + c_pdb)
                    cons_muts_odd = ConstructMutation.objects.filter(
                        construct=c, sequence_number=m['pos'])
                    for cons_mut in cons_muts_odd:
                        print(c_pdb, cons_mut)
            else:
                pdbs_has.append(c_pdb)
                cons_muts = ConstructMutation.objects.filter(
                    construct=c,
                    sequence_number=m['pos'],
                    mutated_amino_acid=m['mut_aa'],
                    wild_type_amino_acid=m['wt_aa'])
                for cons_mut in cons_muts:
                    if m['other_effect'] == 'Non-receptor' or \
                            m['other_effect'] == 'Wrong annotation - remove!':
                        cons_mut.delete()
                        continue
                    # Clear existing to replace with current
                    cons_mut.effects.clear()
                    if m['thermo_effect']:
                        mutation_type, created = ConstructMutationType.objects.get_or_create(
                            slug=slugify('Thermostabilising'),
                            name='Thermostabilising',
                            effect=m['thermo_effect'])
                        cons_mut.effects.add(mutation_type)
                    if m['expression_effect']:
                        mutation_type, created = ConstructMutationType.objects.get_or_create(
                            slug=slugify('Receptor Expression'),
                            name='Receptor Expression',
                            effect=m['expression_effect'])
                        cons_mut.effects.add(mutation_type)
                    if m['site_effect']:
                        mutation_type, created = ConstructMutationType.objects.get_or_create(
                            slug=slugify(m['site_effect']),
                            name=m['site_effect'],
                            effect=m['site_effect_type'])
                        cons_mut.effects.add(mutation_type)
                    if m['other_effect']:
                        mutation_type, created = ConstructMutationType.objects.get_or_create(
                            slug=slugify('Other effect'),
                            name='Other effect',
                            effect=m['other_effect'])
                        cons_mut.effects.add(mutation_type)
                    track_annotated_mutations.append(cons_mut.pk)

    print(len(track_annotated_mutations), 'annotated mutations')
    non_annotated_muts = ConstructMutation.objects.all().exclude(
        pk__in=track_annotated_mutations).order_by(
            'construct__protein__entry_name', 'sequence_number')
    print(len(non_annotated_muts), 'non-annotated mutations')

    csv_rows = [[
        'reference', 'pdb', 'construct name', 'class', 'lig type', 'rec fam',
        'uniprot', 'segment', 'gpcrdb#', 'AA no.', 'WT aa', 'Mut aa',
        '', '', '', '', '', 'Remark'
    ]]

    # Report PDBs never mentioned in the sheet at all...
    missing = list(set(self.all_pdbs) - set(mut_pdb_list.keys()))
    print(sorted(missing), " do not have any mutations annotated -- add them to sheet with NONE if they have none")
    # ...and, of those, the ones whose auto-fetched data does list mutations.
    missing2 = []
    for c_pdb in missing:
        d = cache.get(c_pdb + "_auto_d")
        if not d:
            protein = Protein.objects.filter(entry_name=c_pdb.lower()).get()
            d = fetch_pdb_info(c_pdb, protein)
            cache.set(c_pdb + "_auto_d", d, 60 * 60 * 24)
        if len(d['mutations']):
            missing2.append(c_pdb)
    print(sorted(missing2), " do not have any mutations annotated (but auto has them having) -- add them to sheet with NONE if they have none")

    for mut in non_annotated_muts:
        pdb = mut.construct.structure.pdb_code.index
        uniprot = mut.construct.protein.entry_name
        seg = mut.residue.protein_segment.slug
        if mut.residue.generic_number:
            gn = mut.residue.generic_number.label
        else:
            gn = ''
        pos = mut.sequence_number
        wt_aa = mut.wild_type_amino_acid
        mut_aa = mut.mutated_amino_acid
        annotated_effect = [e.slug for e in mut.effects.all()]
        csv_rows.append([
            '', pdb, '', '', '', '', uniprot, seg, gn, pos, wt_aa, mut_aa,
            '', '', '', '', '', ','.join(annotated_effect)
        ])

    import csv
    with open('construct_mut_missing.csv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(csv_rows)
def match_all_with_uniprot_mutations(self):
    """Cross-check stored ConstructMutation records against the '_auto_d'
    cached PDB data and the Excel annotations; delete orphans and write
    all issues to construct_mut_issues.csv."""
    constructs = Construct.objects.all()
    csv_rows = [[
        'reference', 'pdb', 'construct name', 'class', 'lig type', 'rec fam',
        'uniprot', 'segment', 'gpcrdb#', 'AA no.', 'WT aa', 'Mut aa',
        '', '', '', '', '', 'Remark'
    ]]
    for c in constructs:
        issues = []
        pdbname = c.structure.pdb_code.index
        protein = Protein.objects.filter(entry_name=pdbname.lower()).get()
        uniprot = protein.parent.entry_name

        d = cache.get(pdbname + "_auto_d")
        if not d:
            d = fetch_pdb_info(pdbname, protein)
            cache.set(pdbname + "_auto_d", d, 60 * 60 * 24)

        # Pass 1: stored records missing from the PDB data.
        cons_muts = ConstructMutation.objects.filter(construct=c)
        for m in cons_muts:
            seq_pos = m.sequence_number
            found = False
            for pdb_m in d['mutations']:
                if int(pdb_m['pos']) == seq_pos \
                        and pdb_m['wt'] == m.wild_type_amino_acid \
                        and (pdb_m['mut'] == m.mutated_amino_acid):
                    found = True
                    break
            if not found:
                ignore = False
                for m_xlx in self.excel_mutations:
                    if m_xlx[6] == uniprot and int(m_xlx[9]) == seq_pos \
                            and m_xlx[11] == m.mutated_amino_acid \
                            and m_xlx[10] == m.wild_type_amino_acid:
                        found = False
                        if pdbname in m_xlx[1] or m_xlx[1] == '':
                            found = True
                        if '%' + pdbname in m_xlx[1]:
                            found = False
                        if found:
                            if m_xlx[16] != 'Non-receptor' and \
                                    m_xlx[16] != 'Wrong annotation - remove!':
                                ignore = True
                if ignore:
                    issues.append(('In excel but missing in pdb?',
                                   seq_pos, m.mutated_amino_acid))
                else:
                    issues.append(('Not in excel nor pdb, deleting',
                                   seq_pos, m.mutated_amino_acid))
                    m.delete()
                    print(issues)
                    continue
                issues.append(('missing in pdb?', seq_pos,
                               m.mutated_amino_acid))
                mut_aa = m.mutated_amino_acid
                pos = m.sequence_number
                wt_aa = m.wild_type_amino_acid
                annotated_effect = "Not identified in PDB -- perhaps delete?"
                res = Residue.objects.get(
                    protein_conformation__protein=protein.parent,
                    sequence_number=pos)
                seg = res.protein_segment.slug
                if res.generic_number:
                    gn = res.generic_number.label
                else:
                    gn = ''
                csv_rows.append([
                    '', pdbname, '', '', '', '', protein.parent.entry_name,
                    seg, gn, pos, wt_aa, mut_aa, '', '', '', '', '',
                    annotated_effect
                ])

        # Pass 2: PDB-data mutations missing from the database.
        for m in d['mutations']:
            cons_muts = ConstructMutation.objects.filter(
                construct=c,
                sequence_number=m['pos'],
                mutated_amino_acid=m['mut'],
                wild_type_amino_acid=m['wt'])
            if not cons_muts.exists():
                ignore = False
                for m_xlx in self.excel_mutations:
                    if m_xlx[6] == uniprot and int(m_xlx[9]) == m['pos']:
                        found = False
                        if pdbname in m_xlx[1] or m_xlx[1] == '':
                            found = True
                        if '%' + pdbname in m_xlx[1]:
                            found = False
                        if found:
                            if m_xlx[16] == 'Non-receptor' or \
                                    m_xlx[16] == 'Wrong annotation - remove!':
                                ignore = True
                if ignore:
                    continue
                issues.append((
                    '{}{}{} ({})'.format(m['wt'], m['pos'], m['mut'],
                                         m['type']),
                    ' not in db, nor to be ignored in excel'))
                mut_aa = m['mut']
                pos = m['pos']
                wt_aa = m['wt']
                annotated_effect = m['type']
                res = Residue.objects.get(
                    protein_conformation__protein=protein.parent,
                    sequence_number=pos)
                seg = res.protein_segment.slug
                if res.generic_number:
                    gn = res.generic_number.label
                else:
                    gn = ''
                csv_rows.append([
                    '', pdbname, '', '', '', '', protein.parent.entry_name,
                    seg, gn, pos, wt_aa, mut_aa, '', '', '', '', '',
                    annotated_effect
                ])

        if issues:
            print(pdbname)
            for i in issues:
                print(i)

    import csv
    with open('construct_mut_issues.csv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(csv_rows)