def fetch_publication(self, publication_doi):
    """Return the Publication for a DOI or PubMed id, creating it if needed.

    Results are memoised in ``self.publication_cache`` keyed by the
    normalised identifier, so each identifier hits the database (and the
    remote metadata update) at most once per cache lifetime.

    Args:
        publication_doi: a DOI string, or a PubMed id given as an int, a
            float (``12345.0``) or a string (``"12345"`` / ``"12345.0"``).

    Returns:
        The cached or newly fetched ``Publication`` instance. If the final
        ``save()`` fails the error is logged and the (possibly unsaved)
        object is still cached and returned, as before.
    """
    try:
        # Spreadsheet readers hand PubMed ids over as floats; collapse them
        # to a plain digit string.
        float(publication_doi)
        publication_doi = str(int(publication_doi))
    except ValueError:
        # int() rejects float-formatted *strings* such as "12345.0", which
        # previously fell through and were misclassified as DOIs. Only an
        # all-digit integer part followed by zeros is rewritten, so real
        # DOIs ("10.1000/xyz") are left untouched.
        whole, dot, fraction = str(publication_doi).strip().partition('.')
        if dot and whole.isdigit() and fraction and not fraction.strip('0'):
            publication_doi = whole

    if publication_doi.isdigit():
        # assume pubmed
        pub_type = 'pubmed'
    else:
        # assume doi
        pub_type = 'doi'

    if publication_doi in self.publication_cache:
        return self.publication_cache[publication_doi]

    try:
        wl = WebLink.objects.get(index=publication_doi,
                                 web_resource__slug=pub_type)
    except WebLink.DoesNotExist:
        try:
            wl = WebLink.objects.create(
                index=publication_doi,
                web_resource=WebResource.objects.get(slug=pub_type))
        except IntegrityError:
            # Another writer created the same link first; reuse it.
            wl = WebLink.objects.get(index=publication_doi,
                                     web_resource__slug=pub_type)

    try:
        pub = Publication.objects.get(web_link=wl)
    except Publication.DoesNotExist:
        pub = Publication()
        try:
            pub.web_link = wl
            pub.save()
        except IntegrityError:
            # Lost a creation race; pick up the row that won.
            pub = Publication.objects.get(web_link=wl)

        if pub_type == 'doi':
            pub.update_from_doi(doi=publication_doi)
        elif pub_type == 'pubmed':
            pub.update_from_pubmed_data(index=publication_doi)
        try:
            pub.save()
        except Exception as err:
            # Best effort: keep going, but record what went wrong instead of
            # silently swallowing everything (including SystemExit).
            self.mylog.debug(
                "publication fetching error | module: fetch_publication. Row # is : "
                + str(publication_doi) + ' ' + pub_type + ' | ' + str(err))

    self.publication_cache[publication_doi] = pub
    return pub
def create_publication(self, doi, wr):
    '''Return the Publication for ``doi``, creating WebLink and Publication if missing.

    Args:
        doi: DOI string used as the WebLink index. Any falsy value
            (``''`` or ``None``) means "no publication".
        wr: the WebResource the WebLink belongs to.

    Returns:
        The existing or newly created Publication, or ``None`` when no
        DOI was supplied.
    '''
    # Guard against None as well as '' so a missing DOI never reaches the ORM.
    if not doi:
        return None

    try:
        pub = Publication.objects.get(web_link__index=doi, web_link__web_resource=wr)
    except Publication.DoesNotExist:
        wl, _ = WebLink.objects.get_or_create(index=doi, web_resource=wr)
        pub = Publication()
        pub.web_link = wl
        pub.update_from_doi(doi=doi)
        pub.save()
        self.logger.info('Created Publication:'+str(pub))
    return pub
def main_func(self, positions, iteration, count, lock):
    """Worker loop that validates bias-data rows taken from a shared queue.

    Rows are read from ``self.data_all``; ``count`` is a shared counter
    (anything with a ``.value`` attribute) holding the index of the next
    unclaimed row and ``lock`` guards it. For every claimed row the
    publication, ligand, protein and residue are resolved; rows for which
    one of them cannot be resolved are skipped.

    Args:
        positions: unused, kept for interface compatibility.
        iteration: unused, kept for interface compatibility.
        count: shared counter of the next row index to process.
        lock: context manager serialising access to ``count``.
    """
    missing_proteins = {}
    c = 0
    skipped = 0
    rows = self.data_all
    while True:
        # The bounds check must happen while holding the lock: checking
        # before acquiring it lets another worker claim the last row in
        # between, which made rows[count.value] raise IndexError.
        with lock:
            if count.value >= len(rows):
                break
            r = rows[count.value]
            count.value += 1
        c += 1

        # publication
        try:
            # fix if it thinks it's float.
            float(r['reference'])
            r['reference'] = str(int(r['reference']))
            float(r['review'])
            r['review'] = str(int(r['review']))
        except ValueError:
            pass

        if r['reference'].isdigit():
            # assume pubmed
            pub_type = 'pubmed'
        else:
            # assume doi
            pub_type = 'doi'

        if r['reference'] not in self.publication_cache:
            try:
                wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
            except WebLink.DoesNotExist:
                try:
                    wl = WebLink.objects.create(index=r['reference'], web_resource=WebResource.objects.get(slug=pub_type))
                except IntegrityError:
                    # Another worker created the link first; reuse it.
                    wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)

            try:
                pub = Publication.objects.get(web_link=wl)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = wl
                    pub.save()
                except IntegrityError:
                    pub = Publication.objects.get(web_link=wl)

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except Exception:
                    self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                    continue  # if something off with publication, skip.
            self.publication_cache[r['reference']] = pub
        else:
            pub = self.publication_cache[r['reference']]

        # ligand, cached per (name, id)
        ligand_name = str(r['ligand_name'])
        l = None
        if ligand_name in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[ligand_name]:
                l = self.ligand_cache[ligand_name][r['ligand_id']]
        else:
            self.ligand_cache[ligand_name] = {}

        if not l:
            try:
                l = get_or_make_ligand(r['ligand_id'], r['ligand_type'], ligand_name)
            except Exception as msg:
                print('Something errored with ligand, aborting entry of mutation', r['ligand_name'], r['ligand_type'], r['ligand_id'], r['source_file'])
                print(msg)
                traceback.print_exc()
                continue
            self.ligand_cache[ligand_name][r['ligand_id']] = l

        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
        else:
            # Can contain code to try to figure out what protein it is.
            # Until then, record the miss and skip the row rather than
            # handing an empty QuerySet to the residue lookup below.
            missing_proteins[r['protein']] = missing_proteins.get(r['protein'], 0) + 1
            skipped += 1
            self.logger.error('Skipped due to no protein ' + r['protein'])
            continue

        # FIXME MAKE AA CHECK
        res = Residue.objects.filter(protein_conformation__protein=protein, amino_acid=r['mutation_from'], sequence_number=r['mutation_pos'])
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:' + str(r['mutation_pos']) + ' AA:' + r['mutation_from'])
            skipped += 1
            continue

    self.logger.info('Parsed '+str(c)+' bias data entries. Skipped '+str(skipped))
    print(missing_proteins)
def new_xtals(self, uniprot):
    '''List structures for a UniProt accession that are missing from the DB and the yaml files.

    For every PDB code returned by ``self.pdb_request_by_uniprot(uniprot)``
    the method records codes missing from the database in ``self.db_list``
    and codes missing from the yaml files in ``self.yaml_list``. For codes
    missing from the database it then downloads the PDB file, creates the
    protein / conformation / publication records, writes construct and
    structure yaml files, runs the ``build_structures`` management command
    and finally updates the structure state.

    Any exception raised while processing a single structure is caught and
    printed together with the PDB code; processing continues with the next
    structure.
    '''
    structs = self.pdb_request_by_uniprot(uniprot)
    try:
        protein = Protein.objects.get(accession=uniprot)
    except:
        protein = None
    try:
        x50s = Residue.objects.filter(protein_conformation__protein=protein,generic_number__label__in=['1x50','2x50','3x50','4x50','5x50','6x50','7x50'])
    except:
        x50s = None
    # NOTE(review): ['null'] appears to be the "no structures" sentinel of
    # pdb_request_by_uniprot -- confirm against that helper.
    if structs!=['null']:
        for s in structs:
            missing_from_db, missing_yaml = False, False
            try:
                st_obj = Structure.objects.get(pdb_code__index=s)
            except:
                # Not in the DB (or the lookup failed): queue it unless it is
                # explicitly excluded.
                if s not in self.exceptions:
                    # NOTE(review): a return value of 1 is treated as
                    # "accept this PDB entry" -- semantics live in
                    # pdb_request_by_pdb, confirm there.
                    check = self.pdb_request_by_pdb(s)
                    if check==1:
                        self.db_list.append(s)
                        missing_from_db = True
            if s not in self.yamls and s not in self.exceptions:
                # Reuse the positive check from above when the code was just
                # added to db_list, instead of requesting it again.
                if s not in self.db_list:
                    check = self.pdb_request_by_pdb(s)
                else:
                    check = 1
                if check==1:
                    self.yaml_list.append(s)
                    missing_yaml = True
            if not missing_from_db:
                continue
            try:
                pdb_data_dict = fetch_pdb_info(s, protein, new_xtal=True)
                exp_method = pdb_data_dict['experimental_method']
                # st_type is assigned here but not read again in this method.
                if exp_method=='Electron Microscopy':
                    st_type = StructureType.objects.get(slug='electron-microscopy')
                elif exp_method=='X-ray diffraction':
                    st_type = StructureType.objects.get(slug='x-ray-diffraction')
                if 'deletions' in pdb_data_dict:
                    for d in pdb_data_dict['deletions']:
                        presentx50s = []
                        for x in x50s:
                            # keep x50 residues that lie outside this deletion
                            if not d['start']<x.sequence_number<d['end']:
                                presentx50s.append(x)
                        # Filter out ones without all 7 x50 positions present in the xtal
                        if len(presentx50s)!=7:
                            try:
                                del self.db_list[self.db_list.index(s)]
                                missing_from_db = False
                                del self.yaml_list[self.yaml_list.index(s)]
                            except:
                                # s may already have been removed by an
                                # earlier deletion (index() raises ValueError)
                                pass
                else:
                    print('Warning: no deletions in pdb info, check {}'.format(s))
                    continue
                if missing_from_db:
                    # pref_chain and resolution are assigned but not read
                    # again in this method.
                    pref_chain = ''
                    resolution = pdb_data_dict['resolution']
                    pdb_code, created = WebLink.objects.get_or_create(index=s, web_resource=WebResource.objects.get(slug='pdb'))
                    # Download the PDB file into the working directory and read it back.
                    pdbl = PDB.PDBList()
                    pdbl.retrieve_pdb_file(s, pdir='./', file_format="pdb")
                    with open('./pdb{}.ent'.format(s).lower(),'r') as f:
                        lines = f.readlines()
                    pdb_file = ''
                    publication_date, pubmed, doi = '','',''
                    state = ProteinState.objects.get(slug='inactive')
                    # The new protein entry is named after the PDB code and
                    # inherits family, numbering scheme and species from the
                    # parent (wild-type) protein.
                    new_prot, created = Protein.objects.get_or_create(entry_name=s.lower(), accession=None, name=s.lower(), sequence=pdb_data_dict['wt_seq'], family=protein.family,
                                                                      parent=protein, residue_numbering_scheme=protein.residue_numbering_scheme,
                                                                      sequence_type=ProteinSequenceType.objects.get(slug='mod'), source=ProteinSource.objects.get(name='OTHER'),
                                                                      species=protein.species)
                    new_prot_conf, created = ProteinConformation.objects.get_or_create(protein=new_prot, state=state, template_structure=None)
                    # NOTE(review): PDB header records are column-aligned with
                    # runs of spaces; verify that the prefixes and slice
                    # offsets below match the actual file format.
                    for line in lines:
                        if line.startswith('REVDAT 1'):
                            publication_date = line[13:22]
                        if line.startswith('JRNL PMID'):
                            pubmed = line[19:].strip()
                        if line.startswith('JRNL DOI'):
                            doi = line[19:].strip()
                        pdb_file+=line
                    pdb_data, created = PdbData.objects.get_or_create(pdb=pdb_file)
                    # Convert the REVDAT date (dd-Mon-yy) to ISO format.
                    d = datetime.strptime(publication_date,'%d-%b-%y')
                    publication_date = d.strftime('%Y-%m-%d')
                    # Resolve the publication, preferring the DOI over the
                    # PubMed id. Failures are deliberately ignored here; the
                    # resulting `publication` is not read again in this method.
                    try:
                        if doi!='':
                            try:
                                publication = Publication.objects.get(web_link__index=doi)
                            except Publication.DoesNotExist as e:
                                p = Publication()
                                try:
                                    p.web_link = WebLink.objects.get(index=doi, web_resource__slug='doi')
                                except WebLink.DoesNotExist:
                                    wl = WebLink.objects.create(index=doi, web_resource = WebResource.objects.get(slug='doi'))
                                    p.web_link = wl
                                p.update_from_doi(doi=doi)
                                p.save()
                                publication = p
                        elif pubmed!='':
                            try:
                                publication = Publication.objects.get(web_link__index=pubmed)
                            except Publication.DoesNotExist as e:
                                p = Publication()
                                try:
                                    p.web_link = WebLink.objects.get(index=pubmed, web_resource__slug='pubmed')
                                except WebLink.DoesNotExist:
                                    wl = WebLink.objects.create(index=pubmed, web_resource = WebResource.objects.get(slug='pubmed'))
                                    p.web_link = wl
                                p.update_from_pubmed_data(index=pubmed)
                                p.save()
                                publication = p
                    except:
                        pass
                    pcs = PdbChainSelector(s, protein)
                    pcs.run_dssp()
                    preferred_chain = pcs.select_chain()

                    # Run state identification

                    # Create yaml files
                    with open('../../data/protwis/gpcr/structure_data/constructs/{}.yaml'.format(pdb_code.index), 'w') as construct_file:
                        yaml.dump({'name': pdb_code.index.lower(), 'protein': protein.entry_name}, construct_file, indent=4)
                    with open('../../data/protwis/gpcr/structure_data/structures/{}.yaml'.format(pdb_code.index), 'w') as structure_file:
                        struct_yaml_dict = {'construct': pdb_code.index.lower(), 'pdb': pdb_code.index, 'preferred_chain': preferred_chain, 'auxiliary_protein': '',
                                            'ligand': {'name': 'None', 'pubchemId': 'None', 'title': 'None', 'role': '.nan', 'type': 'None'}, 'signaling_protein': 'None', 'state': 'Inactive'}
                        auxiliary_proteins, ligands = [], []
                        if pdb_data_dict['ligands']!='None':
                            for key, values in pdb_data_dict['ligands'].items():
                                # Skip the het codes in this exclusion list
                                # (presumably buffer / lipid / sugar
                                # components rather than real ligands).
                                if key in ['SO4','NA','CLR','OLA','OLB','OLC','TAR','NAG','EPE','BU1','ACM','GOL','PEG','PO4','TLA','BOG','CIT','PLM','BMA','MAN','MLI','PGE']:
                                    continue
                                else:
                                    ligands.append({'name': key, 'pubchemId': 'None', 'title': pdb_data_dict['ligands'][key]['comp_name'], 'role': '.nan', 'type': 'None'})
                            for key, values in pdb_data_dict['auxiliary'].items():
                                if pdb_data_dict['auxiliary'][key]['subtype'] in ['Expression tag', 'Linker']:
                                    continue
                                else:
                                    auxiliary_proteins.append(pdb_data_dict['auxiliary'][key]['subtype'])
                            for key, values in pdb_data_dict['construct_sequences'].items():
                                if key!=protein.entry_name and key not in struct_yaml_dict['auxiliary_protein']:
                                    if 'arrestin' in key:
                                        struct_yaml_dict['signaling_protein'] = key
                            # NOTE(review): both checks use ">1", so a single
                            # auxiliary protein or ligand keeps the placeholder
                            # values -- confirm this is intended.
                            if len(auxiliary_proteins)>1:
                                struct_yaml_dict['auxiliary_protein'] = ', '.join(auxiliary_proteins)
                            if len(ligands)>1:
                                struct_yaml_dict['ligand'] = ligands
                        yaml.dump(struct_yaml_dict, structure_file, indent=4, default_flow_style=False)

                    # Build residue table for structure
                    build_structure_command = shlex.split('/env/bin/python3 manage.py build_structures -f {}.yaml'.format(pdb_code.index))
                    subprocess.call(build_structure_command)

                    # Check state
                    struct = Structure.objects.get(pdb_code__index=pdb_code.index)
                    pi = PdbStateIdentifier(struct)
                    pi.run()
                    if pi.state!=None:
                        Structure.objects.filter(pdb_code__index=pdb_code.index).update(state=pi.state)
                        print(pi.state, pi.activation_value)
                        # Rewrite the structure yaml with the identified state.
                        # NOTE(review): yaml.load is called without a Loader;
                        # newer PyYAML versions require one -- confirm the
                        # pinned version.
                        with open('../../data/protwis/gpcr/structure_data/structures/{}.yaml'.format(pdb_code.index), 'r') as yf:
                            struct_yaml = yaml.load(yf)
                        struct_yaml['state'] = pi.state.name
                        try:
                            struct_yaml['distance'] = round(float(pi.activation_value), 2)
                        except:
                            struct_yaml['distance'] = None
                        with open('../../data/protwis/gpcr/structure_data/structures/{}.yaml'.format(pdb_code.index), 'w') as struct_yaml_file:
                            yaml.dump(struct_yaml, struct_yaml_file, indent=4, default_flow_style=False)

                    # Check sodium pocket
                    new_prot_conf.sodium_pocket()

                    print('{} added to db (preferred_chain chain: {})'.format(s, preferred_chain))
            except Exception as msg:
                # One failing structure must not abort the remaining ones.
                print(s, msg)
def main_func(self, positions, iteration):
    """Parse a slice of ``self.data`` into MutationExperiment records.

    ``positions`` is a ``(start, end)`` pair selecting the rows of
    ``self.data`` handled by this call; a falsy ``end`` means "to the end".
    ``iteration`` is not used in the body.

    For every row the publication, optional review publication, ligand,
    reference ligand, protein, residue and the various lookup records are
    resolved (created when missing). Rows whose publication cannot be
    saved, or whose protein or residue is not found, are skipped. The
    collected raw rows and experiments are written with two
    ``bulk_create`` calls at the end.
    """
    # filenames
    if not positions[1]:
        rows = self.data[positions[0]:]
    else:
        rows = self.data[positions[0]:positions[1]]

    missing_proteins = {}
    mutants_for_proteins = {}

    c = 0
    skipped = 0
    inserted = 0
    bulk_m = []
    bulk_r = []
    current_sheet = time.time()

    for r in rows:
        current = time.time()
        c += 1

        # publication
        # Normalise float-like ids to digit strings. Both fields share one
        # try block, so a ValueError on 'reference' (e.g. a DOI) leaves
        # 'review' un-normalised as well.
        try:  # fix if it thinks it's float.
            float(r['reference'])
            r['reference'] = str(int(r['reference']))
            float(r['review'])
            r['review'] = str(int(r['review']))
        except ValueError:
            pass

        if r['reference'].isdigit():  # assume pubmed
            pub_type = 'pubmed'
        else:  # assume doi
            pub_type = 'doi'

        if r['reference'] not in self.publication_cache:
            try:
                pub = Publication.objects.get(
                    web_link__index=r['reference'],
                    web_link__web_resource__slug=pub_type)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = WebLink.objects.get(
                        index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    wl = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(
                            slug=pub_type))
                    pub.web_link = wl

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except:
                    self.logger.error('error with reference ' +
                                      str(r['reference']) + ' ' + pub_type)
                    continue  # if something off with publication, skip.
            self.publication_cache[r['reference']] = pub
        else:
            pub = self.publication_cache[r['reference']]

        # review publication: same scheme as above, but optional
        if r['review'].isdigit():  # assume pubmed
            pub_type = 'pubmed'
        else:  # assume doi
            pub_type = 'doi'
        if r['review']:
            if r['review'] not in self.publication_cache:
                try:
                    pub_review = Publication.objects.get(
                        web_link__index=r['review'],
                        web_link__web_resource__slug=pub_type)
                except Publication.DoesNotExist:
                    pub_review = Publication()
                    try:
                        pub_review.web_link = WebLink.objects.get(
                            index=r['review'], web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        wl = WebLink.objects.create(
                            index=r['review'],
                            web_resource=WebResource.objects.get(
                                slug=pub_type))
                        pub_review.web_link = wl

                    if pub_type == 'doi':
                        pub_review.update_from_doi(doi=r['review'])
                    elif pub_type == 'pubmed':
                        pub_review.update_from_pubmed_data(
                            index=r['review'])
                    try:
                        pub_review.save()
                    except:
                        self.logger.error('error with review ' +
                                          str(r['review']) + ' ' + pub_type)
                        continue  # if something off with publication, skip.
                self.publication_cache[r['review']] = pub_review
            else:
                pub_review = self.publication_cache[r['review']]
        else:
            pub_review = None

        # ligand: two-level cache keyed by name, then by id
        l = None
        if str(r['ligand_name']) in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                l = self.ligand_cache[str(
                    r['ligand_name'])][r['ligand_id']]
        else:
            self.ligand_cache[str(r['ligand_name'])] = {}

        if not l:
            l = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                   str(r['ligand_name']))
            self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

        # reference ligand: looked up by name only, cached per name
        l_ref = None
        if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
            l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
        else:
            if Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'], canonical=True
            ).exists(
            ):  # if this name is canonical and it has a ligand record already
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=True)
            elif Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'],
                    canonical=False,
                    ambigious_alias=False
            ).exists(
            ):  # if this matches an alias that only has "one" parent canonical name - eg distinct
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False,
                                           ambigious_alias=False)
            elif Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'],
                    canonical=False,
                    ambigious_alias=True
            ).exists(
            ):  # if this matches an alias that has several canonical parents, must investigate, start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                       canonical=False).exists(
                                       ):  # ambigious_alias not specified
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False)
                l_ref.ambigious_alias = False
                l_ref.save()
            elif r['exp_mu_ligand_ref']:
                # if neither a canonical or alias exists, create the records.
                # Remember to check for canonical / alias status.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                try:
                    l_ref.save()
                except IntegrityError:
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False)
                    print("error failing ligand, duplicate?")
            else:
                l_ref = None
            self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

        # protein_id / residue_id are assigned but not read again below.
        protein_id = 0
        residue_id = 0

        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
            if r['protein'] in mutants_for_proteins:
                mutants_for_proteins[r['protein']] += 1
            else:
                mutants_for_proteins[r['protein']] = 1
        else:
            skipped += 1
            if r['protein'] in missing_proteins:
                missing_proteins[r['protein']] += 1
            else:
                missing_proteins[r['protein']] = 1
            self.logger.error('Skipped due to no protein ' + r['protein'])
            continue

        res = Residue.objects.filter(
            protein_conformation__protein=protein,
            amino_acid=r['mutation_from'],
            sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' +
                              r['protein'] + ' pos:' +
                              str(r['mutation_pos']) + ' AA:' +
                              r['mutation_from'])
            skipped += 1
            continue

        # Lookup-table records; each is only created when the row supplies
        # a value for it.
        if r['ligand_class']:
            l_role, created = LigandRole.objects.get_or_create(
                name=r['ligand_class'],
                defaults={'slug': slugify(r['ligand_class'])[:50]
                          })  # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, created = MutationFunc.objects.get_or_create(
                func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, created = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'],
                prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r[
                'opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r[
                    'opt_agonist']:
            exp_opt_id, created = MutationOptional.objects.get_or_create(
                type=r['opt_type'],
                wt=r['opt_wt'],
                mu=r['opt_mu'],
                sign=r['opt_sign'],
                percentage=r['opt_percentage'],
                qual=r['opt_qual'],
                agonist=r['opt_agonist'])
        else:
            exp_opt_id = None

        try:
            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)
        except IntegrityError:
            mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                            protein=protein,
                                            residue=res)

        # Fold change of mutant vs wild type. Experiment types whose name
        # starts with one of `logtypes` are treated as -log10 values and are
        # converted back before dividing. A ratio below 1 is reported as the
        # negated inverse. `typefold` is assigned but not read again below.
        logtypes = ['pEC50', 'pIC50', 'pK']

        foldchange = 0
        typefold = ''
        if r['exp_wt_value'] != 0 and r[
                'exp_mu_value_raw'] != 0:  # fix for new format
            if re.match("(" + ")|(".join(logtypes) + ")",
                        r['exp_type']):  # -log values!
                foldchange = round(
                    math.pow(10, -r['exp_mu_value_raw']) /
                    pow(10, -r['exp_wt_value']), 3)
                typefold = r['exp_type'] + "_log"
            else:
                foldchange = round(
                    r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                typefold = r['exp_type'] + "_not_log"
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            if foldchange < 1:
                foldchange = -round((1 / foldchange), 3)

        raw_experiment = self.insert_raw(r)
        # NOTE(review): the source text around wt_value / wt_unit is
        # ambiguous about whether wt_unit is passed or commented out; it is
        # kept as an argument here -- confirm against version control.
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            protein=protein,
            residue=res,
            ligand=l,
            ligand_role=l_role,
            ligand_ref=l_ref,
            # raw is attached after MutationRaw.bulk_create below
            optional=exp_opt_id,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'],  #
            wt_unit=r['exp_wt_unit'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange)
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)
        inserted += 1
        end = time.time()
        diff = round(end - current, 2)

    self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' +
                     str(skipped))

    # Write raw rows first so each experiment can reference its raw record.
    # NOTE(review): this relies on bulk_create returning objects in input
    # order (and with usable keys on the configured backend) -- confirm.
    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    for i, me in enumerate(bulk_m):
        me.raw = raws[i]
    MutationExperiment.objects.bulk_create(bulk_m)
    end = time.time()
    diff = round(end - current, 2)
    # the next line is a bare expression statement with no effect
    current_sheet
    diff_2 = round(end - current_sheet, 2)
    print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
          str(skipped))
def main_func(self, positions, iteration, count, lock):
    """Worker loop turning rows of ``self.data_all`` into MutationExperiment records.

    Rows are claimed one at a time through the shared counter ``count``
    (read and incremented while holding ``lock``). ``positions`` and
    ``iteration`` are not used in the body.

    For every row the publication, optional review publication, ligand,
    reference ligand, protein, residue and lookup records are resolved
    (created when missing). Rows whose publication cannot be saved, whose
    ligand lookup raises, or whose protein or residue is not found are
    skipped. The collected raw rows and experiments are written with two
    ``bulk_create`` calls at the end.
    """
    # The positions-based slicing of self.data used by an earlier version
    # is disabled; rows are taken from self.data_all via the shared counter.
    missing_proteins = {}
    mutants_for_proteins = {}
    # maps an entry name as written in the data to the entry name it was
    # resolved to, so the fallback lookups run only once per name
    wrong_uniport_ids = {}

    c = 0
    skipped = 0
    inserted = 0
    bulk_m = []
    bulk_r = []
    current_sheet = time.time()
    rows = self.data_all
    # NOTE(review): the bounds check is evaluated outside `lock`; if several
    # workers share `count`, another worker may advance it between this check
    # and the read below, making rows[count.value] raise IndexError -- confirm.
    while count.value < len(rows):
        with lock:
            r = rows[count.value]
            count.value += 1
        current = time.time()
        c += 1

        # publication
        # Normalise float-like ids to digit strings. Both fields share one
        # try block, so a ValueError on 'reference' (e.g. a DOI) leaves
        # 'review' un-normalised as well.
        try:  # fix if it thinks it's float.
            float(r['reference'])
            r['reference'] = str(int(r['reference']))
            float(r['review'])
            r['review'] = str(int(r['review']))
        except ValueError:
            pass

        if r['reference'].isdigit():  # assume pubmed
            pub_type = 'pubmed'
        else:  # assume doi
            pub_type = 'doi'

        if r['reference'] not in self.publication_cache:
            try:
                wl = WebLink.objects.get(index=r['reference'],
                                         web_resource__slug=pub_type)
            except WebLink.DoesNotExist:
                try:
                    wl = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(
                            slug=pub_type))
                except IntegrityError:
                    # creation collided with an existing row; fetch it instead
                    wl = WebLink.objects.get(index=r['reference'],
                                             web_resource__slug=pub_type)

            try:
                pub = Publication.objects.get(web_link=wl)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = wl
                    pub.save()
                except IntegrityError:
                    pub = Publication.objects.get(web_link=wl)

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except:
                    self.logger.error('error with reference ' +
                                      str(r['reference']) + ' ' + pub_type)
                    continue  # if something off with publication, skip.
            self.publication_cache[r['reference']] = pub
        else:
            pub = self.publication_cache[r['reference']]

        # review publication: optional; may also be a plain URL
        if r['review'].isdigit():  # assume pubmed
            pub_type = 'pubmed'
        elif r['review'].startswith('http'):
            pub_type = 'raw_link'
        else:  # assume doi
            pub_type = 'doi'
        if r['review']:
            if r['review'] not in self.publication_cache:
                try:
                    wl = WebLink.objects.get(index=r['review'],
                                             web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    try:
                        wl = WebLink.objects.create(
                            index=r['review'],
                            web_resource=WebResource.objects.get(
                                slug=pub_type))
                    except IntegrityError:
                        wl = WebLink.objects.get(
                            index=r['review'], web_resource__slug=pub_type)

                try:
                    pub_review = Publication.objects.get(web_link=wl)
                except Publication.DoesNotExist:
                    pub_review = Publication()
                    try:
                        pub_review.web_link = wl
                        pub_review.save()
                    except IntegrityError:
                        pub_review = Publication.objects.get(web_link=wl)

                    # 'raw_link' reviews get no metadata update
                    if pub_type == 'doi':
                        pub_review.update_from_doi(doi=r['review'])
                    elif pub_type == 'pubmed':
                        pub_review.update_from_pubmed_data(
                            index=r['review'])
                    try:
                        pub_review.save()
                    except:
                        self.logger.error('error with review ' +
                                          str(r['review']) + ' ' + pub_type)
                        continue  # if something off with publication, skip.
                self.publication_cache[r['review']] = pub_review
            else:
                pub_review = self.publication_cache[r['review']]
        else:
            pub_review = None

        # ligand: two-level cache keyed by name, then by id
        l = None
        if str(r['ligand_name']) in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                l = self.ligand_cache[str(
                    r['ligand_name'])][r['ligand_id']]
        else:
            self.ligand_cache[str(r['ligand_name'])] = {}

        if not l:
            try:
                l = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                       str(r['ligand_name']))
            except Exception as msg:
                print(
                    'Something errored with ligand, aborting entry of mutation',
                    r['ligand_name'], r['ligand_type'], r['ligand_id'],
                    r['source_file'])
                print(msg)
                traceback.print_exc()
                continue
            self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

        # reference ligand: looked up by name only, cached per name
        l_ref = None
        if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
            l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
        else:
            if Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'], canonical=True
            ).exists(
            ):  # if this name is canonical and it has a ligand record already
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=True)
            elif Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'],
                    canonical=False,
                    ambigious_alias=False
            ).exists(
            ):  # if this matches an alias that only has "one" parent canonical name - eg distinct
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False,
                                           ambigious_alias=False)
            elif Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'],
                    canonical=False,
                    ambigious_alias=True
            ).exists(
            ):  # if this matches an alias that has several canonical parents, must investigate, start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                       canonical=False).exists(
                                       ):  # ambigious_alias not specified
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False)
                l_ref.ambigious_alias = False
                l_ref.save()
            elif r['exp_mu_ligand_ref']:
                # if neither a canonical or alias exists, create the records.
                # Remember to check for canonical / alias status.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                try:
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                except IntegrityError:
                    # the name already exists: prefer the canonical record
                    if Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                             canonical=True).exists():
                        l_ref = Ligand.objects.get(
                            name=r['exp_mu_ligand_ref'], canonical=True)
                    else:
                        l_ref = Ligand.objects.get(
                            name=r['exp_mu_ligand_ref'], canonical=False)
                try:
                    l_ref.save()
                except IntegrityError:
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False)
            else:
                l_ref = None
            self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

        # protein_id / residue_id are assigned but not read again below.
        protein_id = 0
        residue_id = 0

        # Protein resolution, in order: exact entry name; the local
        # wrong_uniport_ids table; a 'uniprot' web link whose index is the
        # upper-cased name; finally a lookup through fetch_from_web_api.
        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
            if r['protein'] in mutants_for_proteins:
                mutants_for_proteins[r['protein']] += 1
            else:
                mutants_for_proteins[r['protein']] = 1
        elif r['protein'] not in missing_proteins:
            try:
                # NOTE(review): r['protein'] is overwritten before the second
                # table lookup, so that lookup uses the already-corrected
                # name and presumably raises KeyError, falling through to the
                # except branch -- confirm intended.
                r['protein'] = wrong_uniport_ids[r['protein']]
                real_uniprot = wrong_uniport_ids[r['protein']]
                protein = Protein.objects.get(entry_name=r['protein'])
            except:
                # look for it as uniprot
                protein = Protein.objects.filter(
                    web_links__web_resource__slug='uniprot',
                    web_links__index=r['protein'].upper())
                if protein.exists():
                    protein = protein.get()
                    real_uniprot = protein.entry_name
                    if r['protein'] in mutants_for_proteins:
                        mutants_for_proteins[r['protein']] += 1
                    else:
                        mutants_for_proteins[r['protein']] = 1
                else:
                    # Try to lookup in uniprot to catch typing errors / variants in entry_name
                    url = 'http://www.uniprot.org/uniprot/$index.xml'
                    cache_dir = ['uniprot', 'id']
                    uniprot_protein = fetch_from_web_api(url,
                                                         r['protein'],
                                                         cache_dir,
                                                         xml=True)
                    try:
                        real_uniprot = uniprot_protein.find(
                            './/{http://uniprot.org/uniprot}name'
                        ).text.lower()
                        protein = Protein.objects.get(
                            entry_name=real_uniprot)
                    except:
                        skipped += 1
                        if r['protein'] in missing_proteins:
                            missing_proteins[r['protein']] += 1
                        else:
                            missing_proteins[r['protein']] = 1
                        self.logger.error(
                            'Skipped due to no protein ' + r['protein'])
                        continue
                # remember the correction and continue with the resolved name
                wrong_uniport_ids[r['protein']] = protein.entry_name
                r['protein'] = real_uniprot
        else:
            # already known to be missing: count it and skip without
            # repeating the lookups (skipped is not incremented here)
            missing_proteins[r['protein']] += 1
            continue

        res = Residue.objects.filter(
            protein_conformation__protein=protein,
            amino_acid=r['mutation_from'],
            sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' +
                              r['protein'] + ' pos:' +
                              str(r['mutation_pos']) + ' AA:' +
                              r['mutation_from'])
            skipped += 1
            continue

        if r['ligand_class']:
            try:
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]
                              })  # FIXME this should not be needed
            except Exception as e:
                # get_or_create failed; fall back to a lookup by slug
                if LigandRole.objects.filter(
                        slug=slugify(r['ligand_class'])[:50]).exists():
                    l_role = LigandRole.objects.get(
                        slug=slugify(r['ligand_class'])[:50])
                    if l_role.name == slugify(r['ligand_class'])[:50]:
                        # if name of role is same as slug, then it was created
                        # by constructs script, replace it
                        l_role.name = r['ligand_class']
                        l_role.save()
                else:
                    print(e)
                    print("Error with", r['ligand_class'],
                          slugify(r['ligand_class'])[:50])
                    l_role, created = LigandRole.objects.get_or_create(
                        slug=slugify(r['ligand_class'])
                        [:50])  # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, created = MutationFunc.objects.get_or_create(
                func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, created = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'],
                prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        # The MutationOptional lookup of the earlier version is disabled;
        # the opt_* values are passed to MutationExperiment directly below.

        try:
            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)
        except IntegrityError:
            mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                            protein=protein,
                                            residue=res)

        # Fold change of mutant vs wild type. Experiment types whose name
        # starts with one of `logtypes` are treated as -log10 values and are
        # converted back before dividing. A ratio between 0 and 1 is reported
        # as the negated inverse. `typefold` is assigned but not read again.
        logtypes = ['pEC50', 'pIC50', 'pK']

        foldchange = 0
        typefold = ''
        if r['exp_wt_value'] != 0 and r[
                'exp_mu_value_raw'] != 0:  # fix for new format
            if re.match("(" + ")|(".join(logtypes) + ")",
                        r['exp_type']):  # -log values!
                try:
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw']) /
                        pow(10, -r['exp_wt_value']), 3)
                except:
                    # on failure the row is printed and foldchange stays 0
                    print(r)
                typefold = r['exp_type'] + "_log"
            elif "%" == r['exp_wt_unit']:
                # if % then it's a difference case, then lower value is bad.
                # Otherwise it's conc and lower is better
                foldchange = round(
                    r['exp_wt_value'] / r['exp_mu_value_raw'], 3)
            else:
                foldchange = round(
                    r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                typefold = r['exp_type'] + "_not_log"
            if foldchange > 0 and foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            if foldchange < 1:
                foldchange = -round((1 / foldchange), 3)
        # store the computed value back so insert_raw() sees it
        r['fold_effect'] = foldchange

        raw_experiment = self.insert_raw(r)
        # NOTE(review): the source text around wt_value / wt_unit is
        # ambiguous about whether wt_unit is passed or commented out; it is
        # kept as an argument here -- confirm against version control.
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            submitting_group=r['submitting_group'],
            data_container=r['data_container'],
            data_container_number=r['data_container_number'],
            protein=protein,
            residue=res,
            ligand=l,
            ligand_role=l_role,
            ligand_ref=l_ref,
            # raw is attached after MutationRaw.bulk_create below;
            # the `optional` relation is no longer set
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'],  #
            wt_unit=r['exp_wt_unit'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange,
            opt_receptor_expression=r['opt_receptor_expression'],
            opt_basal_activity=r['opt_basal_activity'],
            opt_gain_of_activity=r['opt_gain_of_activity'],
            opt_ligand_emax=r['opt_ligand_emax'],
            opt_agonist=r['opt_agonist'],
        )
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)
        inserted += 1
        end = time.time()
        diff = round(end - current, 2)

    self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' +
                     str(skipped))

    # Write raw rows first so each experiment can reference its raw record.
    # NOTE(review): this relies on bulk_create returning objects in input
    # order (and with usable keys on the configured backend) -- confirm.
    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    for i, me in enumerate(bulk_m):
        me.raw = raws[i]
    MutationExperiment.objects.bulk_create(bulk_m)
    end = time.time()
    diff = round(end - current, 2)
    diff_2 = round(end - current_sheet, 2)
    print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
          str(skipped))
    # sorted by miss count, most frequent first; the result is not used
    # further in this method
    sorted_missing_proteins = sorted(missing_proteins.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)
def create_mutant_data(self, filenames):
    """Parse mutant data files and store their rows as mutation experiments.

    Every name in ``filenames`` is resolved against
    ``self.structure_data_dir``. Excel files (``xls``/``xlsx``) are read with
    ``self.loaddatafromexcel`` and ``self.analyse_rows``; ``yaml`` files are
    converted to the same row layout with empty ligand and experiment fields.
    Hidden files, non-files and files with any other extension are skipped.

    For each row the publication, ligand, reference ligand, protein, residue
    and the small lookup records are fetched or created, the raw row is
    stored through ``self.insert_raw`` and a ``MutationExperiment`` is
    fetched or created. Rows whose publication cannot be saved, or whose
    protein or residue is not in the database, are logged and skipped.

    Args:
        filenames: names of the files to parse. When empty, every entry of
            ``self.structure_data_dir`` is used.

    Raises:
        Exception: if the ``pubchem`` web resource cannot be fetched for a
            row that references a PubChem CID or a SMILES string.
    """
    self.logger.info('CREATING MUTANT DATA')

    # what files should be parsed?
    if not filenames:
        filenames = os.listdir(self.structure_data_dir)

    # entry names already reported as missing -> number of rows skipped
    missing_proteins = {}

    # experiment types whose values are -log10 values; built once, not per row
    logtypes = ['pEC50', 'pIC50', 'pK']
    log_type_pattern = re.compile("(" + ")|(".join(logtypes) + ")")

    for source_file in filenames:
        source_file_path = os.sep.join([self.structure_data_dir, source_file])
        if not (os.path.isfile(source_file_path) and source_file[0] != '.'):
            continue

        self.logger.info('Reading file {}'.format(source_file_path))

        # read the data file
        if source_file.endswith(('xlsx', 'xls')):
            rows = self.loaddatafromexcel(source_file_path)
            rows = self.analyse_rows(rows)
        elif source_file.endswith('yaml'):
            # NOTE(review): yaml.load can construct arbitrary objects (and
            # newer PyYAML releases require an explicit Loader); consider
            # yaml.safe_load if these files never rely on Python tags.
            with open(source_file_path, 'r') as yaml_file:
                yaml_rows = yaml.load(yaml_file)
            rows = []
            for entry in yaml_rows:
                row = {
                    'reference': entry['pubmed'],
                    'protein': entry['entry_name'].replace("__", "_").lower(),
                    'mutation_pos': entry['seq'],
                    'mutation_from': entry['from_res'],
                    'mutation_to': entry['to_res'],
                    'ligand_name': '',
                    'ligand_type': '',
                    'ligand_id': '',
                    'ligand_class': '',
                    'exp_type': '',
                    'exp_func': '',
                    'exp_wt_value': 0,
                    'exp_wt_unit': '',
                    'exp_mu_effect_sign': '',
                    'exp_mu_value_raw': 0,
                    'fold_effect': 0,
                    'exp_mu_effect_qual': '',
                    'exp_mu_effect_ligand_prop': '',
                    'exp_mu_ligand_ref': '',
                    'opt_type': '',
                    'opt_wt': 0,
                    'opt_mu': 0,
                    'opt_sign': '',
                    'opt_percentage': 0,
                    'opt_qual': '',
                    'opt_agonist': '',
                }
                # if something is off with the amino acids, drop the row
                if len(row['mutation_to']) > 1 or len(row['mutation_from']) > 1:
                    continue
                rows.append(row)
        else:
            self.logger.info('unknown format {}'.format(source_file))
            continue

        c = 0
        skipped = 0
        for c, r in enumerate(rows, 1):
            if c % 1000 == 0:
                self.logger.info('Parsed ' + str(c) + ' mutant data entries')

            # publication
            try:
                # fix if the reader thinks the reference is a float
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
            except ValueError:
                pass

            if r['reference'].isdigit():
                # assume pubmed
                pub_type = 'pubmed'
            else:
                # assume doi
                pub_type = 'doi'

            try:
                pub = Publication.objects.get(
                    web_link__index=r['reference'],
                    web_link__web_resource__slug=pub_type)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = WebLink.objects.get(
                        index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    pub.web_link = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(slug=pub_type))

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except Exception:
                    # if something is off with the publication, skip the row
                    self.logger.error(
                        'error with reference ' + str(r['reference']) + ' ' + pub_type)
                    continue

            # ligand
            if r['ligand_type'] == 'PubChem CID' or r['ligand_type'] == 'SMILES':
                if r['ligand_type'] == 'PubChem CID':
                    pubchem_lookup_value = 'cid'
                else:
                    pubchem_lookup_value = 'smiles'

                try:
                    web_resource = WebResource.objects.get(slug='pubchem')
                except Exception as error:
                    # abort if the pubchem resource is not found
                    raise Exception('PubChem resource not found, aborting!') from error

                if 'ligand_name' in r and r['ligand_name']:
                    ligand_name = str(r['ligand_name'])
                else:
                    # NOTE(review): False stands in for "no name" and is
                    # passed on to the Ligand queries/helpers unchanged.
                    ligand_name = False

                try:
                    # this name is canonical and already has a ligand record
                    ligand = Ligand.objects.get(
                        name=ligand_name, canonical=True,
                        properities__web_links__web_resource=web_resource,
                        properities__web_links__index=r['ligand_id'])
                except Ligand.DoesNotExist:
                    try:
                        # the ligand exists under a different name
                        l_canonical = Ligand.objects.get(
                            properities__web_links__web_resource=web_resource,
                            properities__web_links__index=r['ligand_id'],
                            canonical=True)
                        ligand, created = Ligand.objects.get_or_create(
                            properities=l_canonical.properities,
                            name=ligand_name, canonical=False)
                        if created:
                            self.logger.info('Created ligand {}'.format(ligand.name))
                    except Ligand.DoesNotExist:
                        # fetch ligand from pubchem
                        default_ligand_type = 'Small molecule'
                        lt, created = LigandType.objects.get_or_create(
                            slug=slugify(default_ligand_type),
                            defaults={'name': default_ligand_type})
                        ligand = Ligand()
                        ligand = ligand.load_from_pubchem(
                            pubchem_lookup_value, r['ligand_id'], lt, ligand_name)
            elif r['ligand_name']:
                if Ligand.objects.filter(name=r['ligand_name'], canonical=True).exists():
                    # this name is canonical and already has a ligand record
                    ligand = Ligand.objects.get(name=r['ligand_name'], canonical=True)
                elif Ligand.objects.filter(name=r['ligand_name'], canonical=False,
                                           ambigious_alias=False).exists():
                    # an alias with only one canonical parent
                    ligand = Ligand.objects.get(name=r['ligand_name'], canonical=False,
                                                ambigious_alias=False)
                elif Ligand.objects.filter(name=r['ligand_name'], canonical=False,
                                           ambigious_alias=True).exists():
                    # an alias with several canonical parents: must be
                    # investigated, so start with an empty record
                    lp = LigandProperities()
                    lp.save()
                    ligand = Ligand()
                    ligand.properities = lp
                    ligand.name = r['ligand_name']
                    ligand.canonical = False
                    ligand.ambigious_alias = True
                    ligand.save()
                    ligand.load_by_name(r['ligand_name'])
                else:
                    # neither a canonical name nor an alias exists: create
                    # the records
                    lp = LigandProperities()
                    lp.save()
                    ligand = Ligand()
                    ligand.properities = lp
                    ligand.name = str(r['ligand_name'])
                    ligand.canonical = True
                    ligand.ambigious_alias = False
                    ligand.save()
                    ligand.load_by_name(str(r['ligand_name']))
            else:
                ligand = None

            # reference ligand
            ref_name = r['exp_mu_ligand_ref']
            if Ligand.objects.filter(name=ref_name, canonical=True).exists():
                # this name is canonical and already has a ligand record
                ligand_ref = Ligand.objects.get(name=ref_name, canonical=True)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=False).exists():
                # an alias with only one canonical parent
                ligand_ref = Ligand.objects.get(name=ref_name, canonical=False,
                                                ambigious_alias=False)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=True).exists():
                # an alias with several canonical parents: start with empty
                lp = LigandProperities()
                lp.save()
                ligand_ref = Ligand()
                ligand_ref.properities = lp
                ligand_ref.name = ref_name
                ligand_ref.canonical = False
                ligand_ref.ambigious_alias = True
                ligand_ref.save()
                ligand_ref.load_by_name(ref_name)
                ligand_ref.save()
            elif ref_name:
                # neither a canonical name nor an alias exists: create the
                # records
                lp = LigandProperities()
                lp.save()
                ligand_ref = Ligand()
                ligand_ref.properities = lp
                ligand_ref.name = ref_name
                ligand_ref.canonical = True
                ligand_ref.ambigious_alias = False
                ligand_ref.save()
                ligand_ref.load_by_name(ref_name)
                ligand_ref.save()
            else:
                ligand_ref = None

            # protein
            protein_matches = Protein.objects.filter(entry_name=r['protein'])
            if protein_matches.exists():
                protein = protein_matches.get()
            else:
                skipped += 1
                if r['protein'] in missing_proteins:
                    missing_proteins[r['protein']] += 1
                else:
                    # report each missing protein only once
                    missing_proteins[r['protein']] = 1
                    self.logger.error('Skipped due to no protein ' + r['protein'])
                continue

            # residue
            residue_matches = Residue.objects.filter(
                protein_conformation__protein=protein,
                sequence_number=r['mutation_pos'])
            if residue_matches.exists():
                res = residue_matches.get()
            else:
                self.logger.error('Skipped due to no residue ' + r['protein']
                                  + ' pos:' + str(r['mutation_pos']))
                skipped += 1
                continue

            # lookup records
            if r['ligand_class']:
                # FIXME this should not be needed
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]})
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(
                    func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'],
                    prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                    or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']):
                exp_opt_id, created = MutationOptional.objects.get_or_create(
                    type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'],
                    sign=r['opt_sign'], percentage=r['opt_percentage'],
                    qual=r['opt_qual'], agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)

            # fold change: ratios below 1 are stored as the negated inverse
            foldchange = 0
            if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:
                # fix for new format
                if log_type_pattern.match(r['exp_type']):
                    # -log values!
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw']) / pow(10, -r['exp_wt_value']), 3)
                else:
                    foldchange = round(r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                # a tiny fold effect rounds to 0; do not divide by it
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)

            raw_experiment = self.insert_raw(r)
            MutationExperiment.objects.get_or_create(
                refs=pub,
                protein=protein,
                residue=res,
                ligand=ligand,
                ligand_role=l_role,
                ligand_ref=ligand_ref,
                raw=raw_experiment,
                optional=exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],  # wt_unit=r['exp_wt_unit'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange,
            )

        self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' + str(skipped))

    self.logger.info('COMPLETED CREATING MUTANTS')
def build_g_prot_struct(self, alpha_prot, pdb, data):
    """Create a SignprotStructure and its extra-protein records for one PDB entry.

    Args:
        alpha_prot: protein object of the G alpha subunit; stored as the
            structure's protein and used for the alpha extra-protein record.
        pdb: PDB code, used as the index of the 'pdb' web link.
        data: mapping describing the entry. The keys read here are
            'release_date', 'method', 'doi', 'pubmedId', 'resolution',
            'other', 'alpha', 'alpha_chain', 'alpha_coverage', 'beta',
            'beta_chain', 'gamma' and 'gamma_chain'.
    """
    ss = SignprotStructure()
    pdb_code, p_c = WebLink.objects.get_or_create(
        index=pdb, web_resource=WebResource.objects.get(slug='pdb'))
    pub_date = data['release_date']

    # Structure type
    method = data['method'].lower()
    if 'x-ray' in method:
        structure_type_slug = 'x-ray-diffraction'
    elif 'electron' in method:
        structure_type_slug = 'electron-microscopy'
    else:
        structure_type_slug = '-'.join(method.split(' '))

    try:
        structure_type = StructureType.objects.get(slug=structure_type_slug)
    except StructureType.DoesNotExist:
        structure_type, c = StructureType.objects.get_or_create(
            slug=structure_type_slug, name=data['method'])
        self.logger.info('Created StructureType:' + str(structure_type))

    # Publication: prefer the DOI, fall back to the PubMed id
    if data['doi']:
        try:
            pub = Publication.objects.get(web_link__index=data['doi'])
        except Publication.DoesNotExist:
            pub = Publication()
            wl, created = WebLink.objects.get_or_create(
                index=data['doi'],
                web_resource=WebResource.objects.get(slug='doi'))
            pub.web_link = wl
            # a DOI has to be resolved with the DOI updater, not the
            # PubMed one
            pub.update_from_doi(doi=data['doi'])
            pub.save()
            self.logger.info('Created Publication:' + str(pub))
    elif data['pubmedId']:
        try:
            pub = Publication.objects.get(web_link__index=data['pubmedId'])
        except Publication.DoesNotExist:
            pub = Publication()
            wl, created = WebLink.objects.get_or_create(
                index=data['pubmedId'],
                web_resource=WebResource.objects.get(slug='pubmed'))
            pub.web_link = wl
            pub.update_from_pubmed_data(index=data['pubmedId'])
            pub.save()
            self.logger.info('Created Publication:' + str(pub))
    else:
        pub = None

    ss.pdb_code = pdb_code
    ss.structure_type = structure_type
    ss.resolution = data['resolution']
    ss.publication_date = pub_date
    ss.publication = pub
    ss.protein = alpha_prot
    # the structure must be saved before M2M relations can be added
    ss.save()

    # Stabilizing agent
    agent_renames = {
        'REGULATOR OF G-PROTEIN SIGNALING 14': 'Regulator of G-protein signaling 14',
        'Nanobody 35': 'Nanobody-35',
        'ADENYLATE CYCLASE, TYPE V': 'Adenylate cyclase, type V',
        '1-phosphatidylinositol-4,5-bisphosphate phosphodiesterase beta-3':
            '1-phosphatidylinositol 4,5-bisphosphate phosphodiesterase beta-3',
    }
    for o in data['other']:
        if len(o) > 75:
            continue
        o = agent_renames.get(o, o)
        stabagent, sa_created = StructureStabilizingAgent.objects.get_or_create(
            slug=o.replace(' ', '-').replace(' ', '-'), name=o)
        ss.stabilizing_agents.add(stabagent)
    ss.save()

    # Extra proteins
    # Alpha - ### A bit redundant, consider changing this in the future
    if data['alpha']:
        alpha_sep = SignprotStructureExtraProteins()
        alpha_sep.wt_protein = alpha_prot
        alpha_sep.structure = ss
        alpha_sep.protein_conformation = ProteinConformation.objects.get(
            protein=alpha_prot)
        alpha_sep.display_name = self.display_name_lookup[alpha_prot.family.name]
        alpha_sep.note = None
        alpha_sep.chain = data['alpha_chain']
        alpha_sep.category = 'G alpha'
        # coverage as a percentage of the wild-type sequence, capped at 100
        cov = round(data['alpha_coverage'] / len(alpha_prot.sequence) * 100)
        if cov > 100:
            self.logger.warning(
                "SignprotStructureExtraProtein Alpha subunit sequence coverage of {} is {}% which is longer than 100% in structure {}"
                .format(alpha_sep, cov, ss))
            cov = 100
        alpha_sep.wt_coverage = cov
        alpha_sep.save()

    # Beta and gamma subunits are stored the same way, without coverage
    for subunit, category in (('beta', 'G beta'), ('gamma', 'G gamma')):
        if not data[subunit]:
            continue
        subunit_prot = Protein.objects.get(accession=data[subunit])
        sep = SignprotStructureExtraProteins()
        sep.wt_protein = subunit_prot
        sep.structure = ss
        sep.protein_conformation = ProteinConformation.objects.get(
            protein=subunit_prot)
        sep.display_name = self.display_name_lookup[subunit_prot.name]
        sep.note = None
        sep.chain = data[subunit + '_chain']
        sep.category = category
        sep.wt_coverage = None
        sep.save()

    self.logger.info('Created SignprotStructure: {}'.format(ss.pdb_code))
def build_g_prot_struct(self, alpha_prot, pdb, data):
    """Create a SignprotStructure, its PDB data and its extra-protein records.

    The PDB file for ``pdb`` is downloaded from rcsb.org and stored through
    ``PdbData`` before the structure is saved.

    Args:
        alpha_prot: protein object of the G alpha subunit; stored as the
            structure's protein and used for the alpha extra-protein record.
        pdb: PDB code, used as the index of the "pdb" web link and in the
            download URL.
        data: mapping describing the entry. The keys read here are
            "release_date", "method", "doi", "pubmedId", "resolution",
            "other", "alpha", "alpha_chain", "alpha_coverage", "beta",
            "beta_chain", "gamma" and "gamma_chain".
    """
    ss = SignprotStructure()
    pdb_code, p_c = WebLink.objects.get_or_create(
        index=pdb, web_resource=WebResource.objects.get(slug="pdb"))
    pub_date = data["release_date"]

    # Structure type
    method = data["method"].lower()
    if "x-ray" in method:
        structure_type_slug = "x-ray-diffraction"
    elif "electron" in method:
        structure_type_slug = "electron-microscopy"
    else:
        structure_type_slug = "-".join(method.split(" "))

    try:
        structure_type = StructureType.objects.get(slug=structure_type_slug)
    except StructureType.DoesNotExist:
        structure_type, c = StructureType.objects.get_or_create(
            slug=structure_type_slug, name=data["method"])
        self.logger.info("Created StructureType:" + str(structure_type))

    # Publication: prefer the DOI, fall back to the PubMed id
    if data["doi"]:
        try:
            pub = Publication.objects.get(web_link__index=data["doi"])
        except Publication.DoesNotExist:
            pub = Publication()
            wl, created = WebLink.objects.get_or_create(
                index=data["doi"],
                web_resource=WebResource.objects.get(slug="doi"))
            pub.web_link = wl
            # a DOI has to be resolved with the DOI updater, not the
            # PubMed one
            pub.update_from_doi(doi=data["doi"])
            pub.save()
            self.logger.info("Created Publication:" + str(pub))
    elif data["pubmedId"]:
        try:
            pub = Publication.objects.get(web_link__index=data["pubmedId"])
        except Publication.DoesNotExist:
            pub = Publication()
            wl, created = WebLink.objects.get_or_create(
                index=data["pubmedId"],
                web_resource=WebResource.objects.get(slug="pubmed"))
            pub.web_link = wl
            pub.update_from_pubmed_data(index=data["pubmedId"])
            pub.save()
            self.logger.info("Created Publication:" + str(pub))
    else:
        pub = None

    # PDB data
    url = 'https://www.rcsb.org/pdb/files/{}.pdb'.format(pdb)
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        pdbdata_raw = response.read().decode('utf-8')
    pdbdata_object = PdbData.objects.get_or_create(pdb=pdbdata_raw)[0]

    ss.pdb_code = pdb_code
    ss.structure_type = structure_type
    ss.resolution = data["resolution"]
    ss.publication_date = pub_date
    ss.publication = pub
    ss.protein = alpha_prot
    ss.pdb_data = pdbdata_object
    # the structure must be saved before M2M relations can be added
    ss.save()

    # Stabilizing agent
    agent_renames = {
        "REGULATOR OF G-PROTEIN SIGNALING 14": "Regulator of G-protein signaling 14",
        "Nanobody 35": "Nanobody-35",
        "ADENYLATE CYCLASE, TYPE V": "Adenylate cyclase, type V",
        "1-phosphatidylinositol-4,5-bisphosphate phosphodiesterase beta-3":
            "1-phosphatidylinositol 4,5-bisphosphate phosphodiesterase beta-3",
    }
    for o in data["other"]:
        if len(o) > 75:
            continue
        o = agent_renames.get(o, o)
        stabagent, sa_created = StructureStabilizingAgent.objects.get_or_create(
            slug=o.replace(" ", "-").replace(" ", "-"), name=o)
        ss.stabilizing_agents.add(stabagent)
    ss.save()

    # Extra proteins
    # Alpha - ### A bit redundant, consider changing this in the future
    if data["alpha"]:
        alpha_sep = SignprotStructureExtraProteins()
        alpha_sep.wt_protein = alpha_prot
        alpha_sep.structure = ss
        alpha_sep.protein_conformation = ProteinConformation.objects.get(
            protein=alpha_prot)
        alpha_sep.display_name = self.display_name_lookup[alpha_prot.family.name]
        alpha_sep.note = None
        alpha_sep.chain = data["alpha_chain"]
        alpha_sep.category = "G alpha"
        # coverage as a percentage of the wild-type sequence, capped at 100
        cov = round(data["alpha_coverage"] / len(alpha_prot.sequence) * 100)
        if cov > 100:
            self.logger.warning(
                "SignprotStructureExtraProtein Alpha subunit sequence coverage of {} is {}% which is longer than 100% in structure {}"
                .format(alpha_sep, cov, ss))
            cov = 100
        alpha_sep.wt_coverage = cov
        alpha_sep.save()

    # Beta and gamma subunits are stored the same way, without coverage
    for subunit, category in (("beta", "G beta"), ("gamma", "G gamma")):
        if not data[subunit]:
            continue
        subunit_prot = Protein.objects.get(accession=data[subunit])
        sep = SignprotStructureExtraProteins()
        sep.wt_protein = subunit_prot
        sep.structure = ss
        sep.protein_conformation = ProteinConformation.objects.get(
            protein=subunit_prot)
        sep.display_name = self.display_name_lookup[subunit_prot.name]
        sep.note = None
        sep.chain = data[subunit + "_chain"]
        sep.category = category
        sep.wt_coverage = None
        sep.save()

    self.logger.info("Created SignprotStructure: {}".format(ss.pdb_code))
def main_func(self, positions, iteration):
    """Build structure records from a slice of the structure annotation files.

    For every annotation file in the slice the structure record, its PDB
    data, publication, ligands, segments, coordinate/engineering
    descriptions, protein anomalies and stabilizing agents are created or
    updated, after which rotamers and interactions are recalculated.

    Args:
        positions: two-item sequence ``(start, end)`` selecting
            ``self.filenames[start:end]``; a falsy ``end`` means "to the
            end of the list".
        iteration: 1 processes only files flagged as representative,
            2 processes only the non-representative ones.

    Raises:
        Exception: if the 'pdb' web resource cannot be fetched.
    """
    # PDB records are column aligned, so keyword and qualifier are separated
    # by a run of spaces rather than exactly one
    het_record = re.compile(r"HET(?:SYN|NAM)[\s]+([\w]{3})[\s]+(.+)")
    first_revision = re.compile(r"REVDAT\s+1(?!\d)")
    journal_pmid = re.compile(r"JRNL\s+PMID")
    journal_doi = re.compile(r"JRNL\s+DOI")

    # filenames
    if not positions[1]:
        filenames = self.filenames[positions[0]:]
    else:
        filenames = self.filenames[positions[0]:positions[1]]

    for source_file in filenames:
        source_file_path = os.sep.join([self.structure_data_dir, source_file])
        if not (os.path.isfile(source_file_path) and source_file[0] != '.'):
            continue

        self.logger.info('Reading file {}'.format(source_file_path))
        # read the yaml file
        # NOTE(review): yaml.load can construct arbitrary objects (and newer
        # PyYAML releases require an explicit Loader); consider
        # yaml.safe_load if these files never rely on Python tags.
        with open(source_file_path, 'r') as f:
            sd = yaml.load(f)

        # is this a representative structure (will be used to guide
        # structure-based alignments)?
        representative = bool('representative' in sd and sd['representative'])

        # only process representative structures on first iteration
        if not representative and iteration == 1:
            continue

        # skip representative structures on second iteration
        if representative and iteration == 2:
            continue

        # is there a construct?
        if 'construct' not in sd:
            self.logger.error('No construct specified, skipping!')
            continue

        # does the construct exist?
        try:
            con = Protein.objects.get(entry_name=sd['construct'])
        except Protein.DoesNotExist:
            self.logger.error('Construct {} does not exists, skipping!'.format(sd['construct']))
            continue

        # create a structure record
        try:
            s = Structure.objects.get(protein_conformation__protein=con)
        except Structure.DoesNotExist:
            s = Structure()

        s.representative = representative

        # protein state
        if 'state' not in sd:
            self.logger.warning('State not defined, using default state {}'.format(
                settings.DEFAULT_PROTEIN_STATE))
            # use the same setting that the warning above reports
            state = settings.DEFAULT_PROTEIN_STATE.title()
        else:
            state = sd['state']
        state_slug = slugify(state)
        try:
            ps, created = ProteinState.objects.get_or_create(slug=state_slug,
                                                             defaults={'name': state})
            if created:
                self.logger.info('Created protein state {}'.format(ps.name))
        except IntegrityError:
            ps = ProteinState.objects.get(slug=state_slug)
        s.state = ps

        # protein conformation
        try:
            s.protein_conformation = ProteinConformation.objects.get(protein=con)
        except ProteinConformation.DoesNotExist:
            self.logger.error('Protein conformation for construct {} does not exists'.format(con))
            continue
        # compare against the state record, not the state name string
        if s.protein_conformation.state != ps:
            ProteinConformation.objects.filter(protein=con).update(state=ps)

        # a structure without a PDB code cannot be processed
        if 'pdb' not in sd:
            self.logger.error(
                'PDB code not specified for structure {}, skipping!'.format(source_file))
            continue

        # get the PDB file and save to DB
        sd['pdb'] = sd['pdb'].upper()
        if not os.path.exists(self.pdb_data_dir):
            os.makedirs(self.pdb_data_dir)

        pdb_path = os.sep.join([self.pdb_data_dir, sd['pdb'] + '.pdb'])
        if not os.path.isfile(pdb_path):
            self.logger.info('Fetching PDB file {}'.format(sd['pdb']))
            url = 'http://www.rcsb.org/pdb/files/%s.pdb' % sd['pdb']
            with urlopen(url) as response:
                pdbdata_raw = response.read().decode('utf-8')
            with open(pdb_path, 'w') as f:
                f.write(pdbdata_raw)
        else:
            with open(pdb_path, 'r') as pdb_file:
                pdbdata_raw = pdb_file.read()

        pdbdata, created = PdbData.objects.get_or_create(pdb=pdbdata_raw)
        s.pdb_data = pdbdata

        # UPDATE HETSYN with its PDB reference instead + GRAB PUB DATE,
        # PMID, DOI AND RESOLUTION
        hetsyn = {}
        hetsyn_reverse = {}
        for line in pdbdata_raw.splitlines():
            if line.startswith(('HETSYN', 'HETNAM')):
                # need to fix bad PDB formatting where col4 and col5 are put
                # together for some reason -- usually seen when the id is
                # +1000
                m = het_record.match(line)
                if m:
                    hetsyn[m.group(2).strip()] = m.group(1).upper()
                    hetsyn_reverse[m.group(1)] = m.group(2).strip().upper()
            if first_revision.match(line):
                sd['publication_date'] = line[13:22]
            if journal_pmid.match(line):
                sd['pubmed_id'] = line[19:].strip()
            if journal_doi.match(line):
                sd['doi_id'] = line[19:].strip()

        if len(hetsyn) == 0:
            self.logger.info("PDB file contained NO hetsyn")

        with open(pdb_path, 'r') as header:
            header_dict = parse_pdb_header(header)
        sd['publication_date'] = header_dict['release_date']
        sd['resolution'] = str(header_dict['resolution']).strip()
        sd['structure_method'] = header_dict['structure_method']

        # structure type
        if 'structure_method' in sd and sd['structure_method']:
            structure_type = sd['structure_method'].capitalize()
            structure_type_slug = slugify(sd['structure_method'])
            try:
                st, created = StructureType.objects.get_or_create(
                    slug=structure_type_slug, defaults={'name': structure_type})
                if created:
                    self.logger.info('Created structure type {}'.format(st))
            except IntegrityError:
                st = StructureType.objects.get(slug=structure_type_slug)
            s.structure_type = st
        else:
            self.logger.warning('No structure type specified in PDB file {}'.format(sd['pdb']))

        # replace annotated ligand names by the PDB het code where possible
        matched = 0
        if 'ligand' in sd and sd['ligand']:
            if isinstance(sd['ligand'], list):
                ligands = sd['ligand']
            else:
                ligands = [sd['ligand']]
            for ligand in ligands:
                if 'name' in ligand:
                    if ligand['name'].upper() in hetsyn:
                        self.logger.info(
                            'Ligand {} matched to PDB records'.format(ligand['name']))
                        matched = 1
                        ligand['name'] = hetsyn[ligand['name'].upper()]
                    elif ligand['name'].upper() in hetsyn_reverse:
                        matched = 1

        if matched == 0 and len(hetsyn) > 0:
            self.logger.info('No ligand names found in HET in structure {}'.format(sd['pdb']))

        # REMOVE? can be used to dump structure files with updated ligands
        # yaml.dump(sd, open(source_file_path, 'w'), indent=4)

        # pdb code
        try:
            web_resource = WebResource.objects.get(slug='pdb')
        except Exception as error:
            # abort if pdb resource is not found
            raise Exception('PDB resource not found, aborting!') from error
        s.pdb_code, created = WebLink.objects.get_or_create(index=sd['pdb'],
                                                            web_resource=web_resource)

        # insert into plain text fields
        if 'preferred_chain' in sd:
            s.preferred_chain = sd['preferred_chain']
        else:
            self.logger.warning('Preferred chain not specified for structure {}'.format(sd['pdb']))
        if 'resolution' in sd:
            s.resolution = float(sd['resolution'])
        else:
            self.logger.warning('Resolution not specified for structure {}'.format(sd['pdb']))
        if 'publication_date' in sd:
            s.publication_date = sd['publication_date']
        else:
            self.logger.warning('Publication date not specified for structure {}'.format(sd['pdb']))

        # publication: best effort, a failure must not stop the build
        try:
            if 'doi_id' in sd:
                try:
                    s.publication = Publication.objects.get(web_link__index=sd['doi_id'])
                except Publication.DoesNotExist:
                    p = Publication()
                    try:
                        p.web_link = WebLink.objects.get(index=sd['doi_id'],
                                                         web_resource__slug='doi')
                    except WebLink.DoesNotExist:
                        p.web_link = WebLink.objects.create(
                            index=sd['doi_id'],
                            web_resource=WebResource.objects.get(slug='doi'))
                    p.update_from_doi(doi=sd['doi_id'])
                    p.save()
                    s.publication = p
            elif 'pubmed_id' in sd:
                try:
                    s.publication = Publication.objects.get(web_link__index=sd['pubmed_id'])
                except Publication.DoesNotExist:
                    p = Publication()
                    try:
                        p.web_link = WebLink.objects.get(index=sd['pubmed_id'],
                                                         web_resource__slug='pubmed')
                    except WebLink.DoesNotExist:
                        p.web_link = WebLink.objects.create(
                            index=sd['pubmed_id'],
                            web_resource=WebResource.objects.get(slug='pubmed'))
                    p.update_from_pubmed_data(index=sd['pubmed_id'])
                    p.save()
                    s.publication = p
        except Exception:
            self.logger.error('Error saving publication for structure {}'.format(sd['pdb']))

        # save structure before adding M2M relations
        s.save()

        # Delete previous interaction data to prevent errors.
        ResidueFragmentInteraction.objects.filter(structure_ligand_pair__structure=s).delete()
        StructureLigandInteraction.objects.filter(structure=s).delete()
        # Remove previous Rotamers/Residues to prepare repopulate
        Fragment.objects.filter(structure=s).delete()
        Rotamer.objects.filter(structure=s).all().delete()
        Residue.objects.filter(protein_conformation=s.protein_conformation).all().delete()

        # endogenous ligand(s)
        default_ligand_type = 'Small molecule'
        if representative and 'endogenous_ligand' in sd and sd['endogenous_ligand']:
            if isinstance(sd['endogenous_ligand'], list):
                endogenous_ligands = sd['endogenous_ligand']
            else:
                endogenous_ligands = [sd['endogenous_ligand']]
            for endogenous_ligand in endogenous_ligands:
                if endogenous_ligand['type']:
                    lt, created = LigandType.objects.get_or_create(
                        slug=slugify(endogenous_ligand['type']),
                        defaults={'name': endogenous_ligand['type']})
                else:
                    lt, created = LigandType.objects.get_or_create(
                        slug=slugify(default_ligand_type),
                        defaults={'name': default_ligand_type})
                ligand = Ligand()

                if 'iupharId' not in endogenous_ligand:
                    endogenous_ligand['iupharId'] = 0

                ligand = ligand.load_by_gtop_id(endogenous_ligand['name'],
                                                endogenous_ligand['iupharId'], lt)
                try:
                    s.protein_conformation.protein.parent.endogenous_ligands.add(ligand)
                except IntegrityError:
                    self.logger.info('Endogenous ligand for protein {}, already added. Skipping.'.format(
                        s.protein_conformation.protein.parent))

        # ligands
        # defined up front so the interaction calculation below also works
        # for structures without any annotated ligand
        peptide_chain = ""
        if 'ligand' in sd and sd['ligand']:
            if isinstance(sd['ligand'], list):
                ligands = sd['ligand']
            else:
                ligands = [sd['ligand']]
            for ligand in ligands:
                lig_obj = False
                # reset per ligand: only the last ligand's chain is kept
                peptide_chain = ""
                if 'chain' in ligand:
                    peptide_chain = ligand['chain']
                    ligand['name'] = 'pep'
                # some ligand names were inserted as the string 'None'
                if not (ligand['name'] and ligand['name'] != 'None'):
                    continue

                # use annotated ligand type or default type
                if ligand['type']:
                    lt, created = LigandType.objects.get_or_create(
                        slug=slugify(ligand['type']),
                        defaults={'name': ligand['type']})
                else:
                    lt, created = LigandType.objects.get_or_create(
                        slug=slugify(default_ligand_type),
                        defaults={'name': default_ligand_type})

                # set pdb reference for structure-ligand interaction
                pdb_reference = ligand['name']

                if 'pubchemId' in ligand and ligand['pubchemId'] and ligand['pubchemId'] != 'None':
                    # use pubchem_id: create the ligand and update it by
                    # its pubchem id
                    lig_obj = Ligand()
                    ligand_title = False
                    if 'title' in ligand and ligand['title']:
                        ligand_title = ligand['title']
                    lig_obj = lig_obj.load_from_pubchem('cid', ligand['pubchemId'], lt,
                                                        ligand_title)
                else:
                    # no pubchem id is specified, use the name
                    # use ligand title, if specified
                    if 'title' in ligand and ligand['title']:
                        ligand['name'] = ligand['title']

                    # create empty properties
                    lp = LigandProperities.objects.create()

                    # create the ligand
                    try:
                        lig_obj, created = Ligand.objects.get_or_create(
                            name=ligand['name'], canonical=True,
                            defaults={'properities': lp, 'ambigious_alias': False})
                        if created:
                            self.logger.info('Created ligand {}'.format(ligand['name']))
                    except IntegrityError:
                        lig_obj = Ligand.objects.get(name=ligand['name'], canonical=True)

                # save ligand (skipped when no ligand object was obtained)
                if lig_obj:
                    lig_obj.save()

                # structure-ligand interaction
                if lig_obj and ligand['role']:
                    role_slug = slugify(ligand['role'])
                    try:
                        lr, created = LigandRole.objects.get_or_create(
                            slug=role_slug, defaults={'name': ligand['role']})
                        if created:
                            self.logger.info('Created ligand role {}'.format(ligand['role']))
                    except IntegrityError:
                        lr = LigandRole.objects.get(slug=role_slug)

                    i, created = StructureLigandInteraction.objects.get_or_create(
                        structure=s, ligand=lig_obj, ligand_role=lr, annotated=True,
                        defaults={'pdb_reference': pdb_reference})
                    if i.pdb_reference != pdb_reference:
                        i.pdb_reference = pdb_reference
                        i.save()

        # structure segments
        if 'segments' in sd and sd['segments']:
            for segment, seg_range in sd['segments'].items():
                # fetch sequence segment
                try:
                    protein_segment = ProteinSegment.objects.get(slug=segment)
                except ProteinSegment.DoesNotExist:
                    self.logger.error('Segment {} not found'.format(segment))
                    continue

                StructureSegment.objects.update_or_create(
                    structure=s, protein_segment=protein_segment,
                    defaults={'start': seg_range[0], 'end': seg_range[1]})
        # all representative structures should have defined segments
        elif representative:
            self.logger.warning(
                'Segments not defined for representative structure {}'.format(sd['pdb']))

        # structure segments for modeling
        if 'segments_in_structure' in sd and sd['segments_in_structure']:
            for segment, seg_range in sd['segments_in_structure'].items():
                # fetch sequence segment
                try:
                    protein_segment = ProteinSegment.objects.get(slug=segment)
                except ProteinSegment.DoesNotExist:
                    self.logger.error('Segment {} not found'.format(segment))
                    continue

                StructureSegmentModeling.objects.update_or_create(
                    structure=s, protein_segment=protein_segment,
                    defaults={'start': seg_range[0], 'end': seg_range[1]})

        # structure coordinates
        if 'coordinates' in sd and sd['coordinates']:
            for segment, coordinates in sd['coordinates'].items():
                # fetch sequence segment
                try:
                    protein_segment = ProteinSegment.objects.get(slug=segment)
                except ProteinSegment.DoesNotExist:
                    self.logger.error('Segment {} not found'.format(segment))
                    continue

                # fetch (create if needed) coordinates description
                try:
                    description, created = StructureCoordinatesDescription.objects.get_or_create(
                        text=coordinates)
                    if created:
                        self.logger.info(
                            'Created structure coordinate description {}'.format(coordinates))
                except IntegrityError:
                    description = StructureCoordinatesDescription.objects.get(text=coordinates)

                sc = StructureCoordinates()
                sc.structure = s
                sc.protein_segment = protein_segment
                sc.description = description
                sc.save()

        # structure engineering
        if 'engineering' in sd and sd['engineering']:
            for segment, engineering in sd['engineering'].items():
                # fetch sequence segment
                try:
                    protein_segment = ProteinSegment.objects.get(slug=segment)
                except ProteinSegment.DoesNotExist:
                    self.logger.error('Segment {} not found'.format(segment))
                    continue

                # fetch (create if needed) engineering description
                try:
                    description, created = StructureEngineeringDescription.objects.get_or_create(
                        text=engineering)
                    if created:
                        self.logger.info(
                            'Created structure coordinate description {}'.format(engineering))
                except IntegrityError:
                    description = StructureEngineeringDescription.objects.get(text=engineering)

                se = StructureEngineering()
                se.structure = s
                se.protein_segment = protein_segment
                se.description = description
                se.save()

        # protein anomalies: bulges and constrictions are stored the same way
        scheme = s.protein_conformation.protein.residue_numbering_scheme
        anomaly_kinds = (('bulges', 'bulge', 'Bulge'),
                         ('constrictions', 'constriction', 'Constriction'))
        for sd_key, pa_slug, pa_name in anomaly_kinds:
            if not (sd_key in sd and sd[sd_key]):
                continue
            try:
                pa_type, created = ProteinAnomalyType.objects.get_or_create(
                    slug=pa_slug, defaults={'name': pa_name})
                if created:
                    self.logger.info('Created protein anomaly type {}'.format(pa_type))
            except IntegrityError:
                pa_type = ProteinAnomalyType.objects.get(slug=pa_slug)

            for segment, labels in sd[sd_key].items():
                for label in labels:
                    try:
                        gn, created = ResidueGenericNumber.objects.get_or_create(
                            label=label, scheme=scheme,
                            defaults={'protein_segment': ProteinSegment.objects.get(
                                slug=segment)})
                        if created:
                            self.logger.info('Created generic number {}'.format(gn))
                    except IntegrityError:
                        gn = ResidueGenericNumber.objects.get(label=label, scheme=scheme)

                    try:
                        pa, created = ProteinAnomaly.objects.get_or_create(
                            anomaly_type=pa_type, generic_number=gn)
                        if created:
                            self.logger.info('Created protein anomaly {}'.format(pa))
                    except IntegrityError:
                        # get() returns the record itself, not a tuple
                        pa = ProteinAnomaly.objects.get(anomaly_type=pa_type,
                                                        generic_number=gn)

                    s.protein_anomalies.add(pa)

        # stabilizing agents, FIXME - redesign this!
        # fusion proteins moved to constructs, use this for G-proteins and
        # other agents?
        aux_proteins = []
        for aux_key in ('signaling_protein', 'auxiliary_protein'):
            if aux_key in sd and sd[aux_key] and sd[aux_key] != 'None':
                aux_proteins.append(aux_key)

        for aux_key in aux_proteins:
            if isinstance(sd[aux_key], list):
                aps = sd[aux_key]
            else:
                aps = [sd[aux_key]]
            for aux_protein in aps:
                aux_protein_slug = slugify(aux_protein)[:50]
                try:
                    sa, created = StructureStabilizingAgent.objects.get_or_create(
                        slug=aux_protein_slug, defaults={'name': aux_protein})
                except IntegrityError:
                    sa = StructureStabilizingAgent.objects.get(slug=aux_protein_slug)
                s.stabilizing_agents.add(sa)

        # save structure
        s.save()

        self.logger.info('Calculate rotamers / residues')
        self.create_rotamers(s, pdb_path)

        self.logger.info('Calculate interactions')
        # Should not error anymore. If it does, fix.
        runcalculation(sd['pdb'], peptide_chain)
        parsecalculation(sd['pdb'], False)
def new_xtals(self, uniprot):
    """List GPCR structures missing from GPCRdb and the yaml files; add missing ones.

    Every PDB code returned by ``self.pdb_request_by_uniprot(uniprot)`` is
    checked: codes without a ``Structure`` row are appended to
    ``self.db_list`` and codes absent from ``self.yamls`` are appended to
    ``self.yaml_list`` (in both cases only when the code is not listed in
    ``self.exceptions`` and ``self.pdb_request_by_pdb`` returns 1).

    For each structure missing from the database the PDB file is downloaded,
    protein / conformation / PDB-data / publication rows are created, construct
    and structure yaml files are written below ``settings.DATA_DIR``, the
    ``build_structures`` management command is run for the new yaml file and
    the activation state found by ``PdbStateIdentifier`` is stored in both the
    database and the structure yaml file.

    Args:
        uniprot: UniProt accession used to look up the receptor and its
            PDB entries.

    Returns:
        None. Results are recorded in ``self.db_list``, ``self.yaml_list``,
        the database and the yaml files.
    """
    structs = self.pdb_request_by_uniprot(uniprot)
    try:
        protein = Protein.objects.get(accession=uniprot)
    except Exception:
        protein = None
    try:
        x50s = Residue.objects.filter(
            protein_conformation__protein=protein,
            generic_number__label__in=['1x50', '2x50', '3x50', '4x50', '5x50', '6x50', '7x50'])
    except Exception:
        x50s = None
    if structs == ['null']:
        return

    # Residue names that are never written to the yaml file as ligands.
    ignored_ligands = ['SO4', 'NA', 'CLR', 'OLA', 'OLB', 'OLC', 'TAR', 'NAG', 'EPE', 'BU1',
                       'ACM', 'GOL', 'PEG', 'PO4', 'TLA', 'BOG', 'CIT', 'PLM', 'BMA', 'MAN',
                       'MLI', 'PGE', 'SIN', 'PGO', 'MES', 'ZN', 'NO3', 'NI', 'MG', 'PG4']

    for s in structs:
        missing_from_db = False
        try:
            Structure.objects.get(pdb_code__index=s)
        except Exception:
            if s not in self.exceptions:
                check = self.pdb_request_by_pdb(s)
                if check == 1:
                    self.db_list.append(s)
                    missing_from_db = True
        if s not in self.yamls and s not in self.exceptions:
            if s not in self.db_list:
                check = self.pdb_request_by_pdb(s)
            else:
                check = 1
            if check == 1:
                self.yaml_list.append(s)
        if not missing_from_db:
            continue

        pdb_data_dict = fetch_pdb_info(s, protein, new_xtal=True)
        exp_method = pdb_data_dict['experimental_method']
        # The lookups are kept so that a missing StructureType still fails here.
        if exp_method == 'Electron Microscopy':
            st_type = StructureType.objects.get(slug='electron-microscopy')
        elif exp_method == 'X-ray diffraction':
            st_type = StructureType.objects.get(slug='x-ray-diffraction')

        if 'deletions' in pdb_data_dict:
            for d in pdb_data_dict['deletions']:
                presentx50s = [x for x in x50s
                               if not d['start'] < x.sequence_number < d['end']]
                # Filter out ones without all 7 x50 positions present in the xtal
                if len(presentx50s) != 7:
                    try:
                        del self.db_list[self.db_list.index(s)]
                        missing_from_db = False
                        del self.yaml_list[self.yaml_list.index(s)]
                    except ValueError:
                        # s was already removed from one of the lists.
                        pass
        else:
            print('Warning: no deletions in pdb info, check {}'.format(s))
            continue

        if not missing_from_db:
            continue

        pdb_code, _ = WebLink.objects.get_or_create(
            index=s, web_resource=WebResource.objects.get(slug='pdb'))
        pdbl = PDB.PDBList()
        pdbl.retrieve_pdb_file(s, pdir='./', file_format="pdb")
        with open('./pdb{}.ent'.format(s).lower(), 'r') as f:
            lines = f.readlines()
        pdb_file = ''
        publication_date, pubmed, doi = '', '', ''
        state = ProteinState.objects.get(slug='inactive')
        new_prot, _ = Protein.objects.get_or_create(
            entry_name=s.lower(), accession=None, name=s.lower(),
            sequence=pdb_data_dict['wt_seq'], family=protein.family,
            parent=protein,
            residue_numbering_scheme=protein.residue_numbering_scheme,
            sequence_type=ProteinSequenceType.objects.get(slug='mod'),
            source=ProteinSource.objects.get(name='OTHER'),
            species=protein.species)
        new_prot_conf, _ = ProteinConformation.objects.get_or_create(
            protein=new_prot, state=state, template_structure=None)
        # PDB records are fixed-column: the prefixes must be column-exact for
        # the slices below (date in columns 14-22, JRNL payload from column 20).
        for line in lines:
            if line.startswith('REVDAT   1'):
                publication_date = line[13:22]
            if line.startswith('JRNL        PMID'):
                pubmed = line[19:].strip()
            if line.startswith('JRNL        DOI'):
                doi = line[19:].strip()
            pdb_file += line
        pdb_data, _ = PdbData.objects.get_or_create(pdb=pdb_file)
        d = datetime.strptime(publication_date, '%d-%b-%y')
        publication_date = d.strftime('%Y-%m-%d')

        # Best effort: a failing publication lookup must not stop the import.
        try:
            if doi != '':
                try:
                    publication = Publication.objects.get(web_link__index=doi)
                except Publication.DoesNotExist:
                    p = Publication()
                    try:
                        p.web_link = WebLink.objects.get(index=doi, web_resource__slug='doi')
                    except WebLink.DoesNotExist:
                        wl = WebLink.objects.create(
                            index=doi, web_resource=WebResource.objects.get(slug='doi'))
                        p.web_link = wl
                    p.update_from_doi(doi=doi)
                    p.save()
                    publication = p
            elif pubmed != '':
                try:
                    publication = Publication.objects.get(web_link__index=pubmed)
                except Publication.DoesNotExist:
                    p = Publication()
                    try:
                        p.web_link = WebLink.objects.get(index=pubmed, web_resource__slug='pubmed')
                    except WebLink.DoesNotExist:
                        wl = WebLink.objects.create(
                            index=pubmed, web_resource=WebResource.objects.get(slug='pubmed'))
                        p.web_link = wl
                    p.update_from_pubmed_data(index=pubmed)
                    p.save()
                    publication = p
        except Exception as msg:
            print('Warning: could not fetch publication for {}: {}'.format(s, msg))

        pcs = PdbChainSelector(s, protein)
        pcs.run_dssp()
        preferred_chain = pcs.select_chain()

        # Create yaml files
        construct_path = os.sep.join([settings.DATA_DIR, 'structure_data', 'constructs',
                                      '{}.yaml'.format(pdb_code.index)])
        structure_path = os.sep.join([settings.DATA_DIR, 'structure_data', 'structures',
                                      '{}.yaml'.format(pdb_code.index)])
        with open(construct_path, 'w') as construct_file:
            yaml.dump({'name': pdb_code.index.lower(), 'protein': protein.entry_name},
                      construct_file, indent=4)
        with open(structure_path, 'w') as structure_file:
            struct_yaml_dict = {
                'construct': pdb_code.index.lower(),
                'pdb': pdb_code.index,
                'preferred_chain': preferred_chain,
                'auxiliary_protein': '',
                'ligand': {'name': 'None', 'pubchemId': 'None', 'title': 'None',
                           'role': '.nan', 'type': 'None'},
                'signaling_protein': 'None',
                'state': 'Inactive'}
            auxiliary_proteins, ligands = [], []
            if pdb_data_dict['ligands'] != 'None':
                for key, values in pdb_data_dict['ligands'].items():
                    if key in ignored_ligands:
                        continue
                    ligands.append({'name': key, 'pubchemId': 'None',
                                    'title': values['comp_name'],
                                    'role': '.nan', 'type': 'None'})
            for key, values in pdb_data_dict['auxiliary'].items():
                if values['subtype'] in ['Expression tag', 'Linker']:
                    continue
                auxiliary_proteins.append(values['subtype'])
            for key in pdb_data_dict['construct_sequences']:
                if key != protein.entry_name and key not in struct_yaml_dict['auxiliary_protein']:
                    if 'arrestin' in key:
                        struct_yaml_dict['signaling_protein'] = key
            # NOTE(review): a single auxiliary protein / ligand is NOT written
            # because of the "> 1" comparisons - confirm this is intended.
            if len(auxiliary_proteins) > 1:
                struct_yaml_dict['auxiliary_protein'] = ', '.join(auxiliary_proteins)
            if len(ligands) > 1:
                struct_yaml_dict['ligand'] = ligands
            yaml.dump(struct_yaml_dict, structure_file, indent=4, default_flow_style=False)

        # Build residue table for structure
        build_structure_command = shlex.split(
            '/env/bin/python3 manage.py build_structures -f {}.yaml'.format(pdb_code.index))
        subprocess.call(build_structure_command)

        # Check state
        struct = Structure.objects.get(pdb_code__index=pdb_code.index)
        pi = PdbStateIdentifier(struct)
        pi.run()
        if pi.state is not None:
            Structure.objects.filter(pdb_code__index=pdb_code.index).update(state=pi.state)
            print(pi.state, pi.activation_value)
            # Re-open the file written above (same path) and record the state.
            with open(structure_path, 'r') as yf:
                struct_yaml = yaml.safe_load(yf)
            struct_yaml['state'] = pi.state.name
            try:
                struct_yaml['distance'] = round(float(pi.activation_value), 2)
            except (TypeError, ValueError):
                struct_yaml['distance'] = None
            with open(structure_path, 'w') as struct_yaml_file:
                yaml.dump(struct_yaml, struct_yaml_file, indent=4, default_flow_style=False)

        # Check sodium pocket
        new_prot_conf.sodium_pocket()

        print('{} added to db (preferred_chain chain: {})'.format(s, preferred_chain))
def new_xtals(self, uniprot):
    """List GPCR structures missing from GPCRdb and the yaml files; add missing ones.

    Every PDB code returned by ``self.pdb_request_by_uniprot(uniprot)`` is
    checked: codes without a ``Structure`` row are appended to
    ``self.db_list`` and codes absent from ``self.yamls`` are appended to
    ``self.yaml_list`` (in both cases only when the code is not listed in
    ``self.exceptions`` and ``self.pdb_request_by_pdb`` returns 1).

    For each structure missing from the database the PDB file is downloaded,
    parsed for release date and publication identifiers, and a ``Structure``
    row is created together with its protein, conformation, PDB data and
    publication. Any error while processing one structure is printed and the
    loop continues with the next one.

    Args:
        uniprot: UniProt accession used to look up the receptor and its
            PDB entries.

    Returns:
        None. Results are recorded in ``self.db_list``, ``self.yaml_list``
        and the database.
    """
    structs = self.pdb_request_by_uniprot(uniprot)
    try:
        protein = Protein.objects.get(accession=uniprot)
    except Exception:
        protein = None
    try:
        x50s = Residue.objects.filter(
            protein_conformation__protein=protein,
            generic_number__label__in=['1x50', '2x50', '3x50', '4x50', '5x50', '6x50', '7x50'])
    except Exception:
        x50s = None
    if structs == ['null']:
        return

    for s in structs:
        missing_from_db = False
        try:
            Structure.objects.get(pdb_code__index=s)
        except Exception:
            if s not in self.exceptions:
                check = self.pdb_request_by_pdb(s)
                if check == 1:
                    self.db_list.append(s)
                    missing_from_db = True
        if s not in self.yamls and s not in self.exceptions:
            if s not in self.db_list:
                check = self.pdb_request_by_pdb(s)
            else:
                check = 1
            if check == 1:
                self.yaml_list.append(s)
        if not missing_from_db:
            continue

        try:
            pdb_data_dict = fetch_pdb_info(s, protein)
            exp_method = pdb_data_dict['experimental_method']
            st_type = None
            if exp_method == 'Electron Microscopy':
                st_type, _ = StructureType.objects.get_or_create(
                    slug='electron-microscopy', name=exp_method)
            elif exp_method == 'X-ray diffraction':
                st_type = StructureType.objects.get(slug='x-ray-diffraction')

            if 'deletions' in pdb_data_dict:
                for d in pdb_data_dict['deletions']:
                    presentx50s = [x for x in x50s
                                   if not d['start'] < x.sequence_number < d['end']]
                    # Filter out ones without all 7 x50 positions present in the xtal
                    if len(presentx50s) != 7:
                        try:
                            del self.db_list[self.db_list.index(s)]
                            missing_from_db = False
                            del self.yaml_list[self.yaml_list.index(s)]
                        except ValueError:
                            # s was already removed from one of the lists.
                            pass

            if missing_from_db:
                if st_type is None:
                    # Fail before any rows are created instead of hitting an
                    # unbound st_type when the Structure is built.
                    raise ValueError(
                        'Unsupported experimental method {!r} for {}'.format(exp_method, s))
                resolution = pdb_data_dict['resolution']
                pdb_code, _ = WebLink.objects.get_or_create(
                    index=s, web_resource=WebResource.objects.get(slug='pdb'))
                pdbl = PDB.PDBList()
                pdbl.retrieve_pdb_file(s, pdir='./', file_format="pdb")
                with open('./pdb{}.ent'.format(s).lower(), 'r') as f:
                    lines = f.readlines()
                pdb_file = ''
                publication_date, pubmed, doi = '', '', ''
                # Stays None when the entry has neither DOI nor PubMed id or
                # when the publication lookup below fails.
                publication = None
                state = ProteinState.objects.get(slug='inactive')
                new_prot, _ = Protein.objects.get_or_create(
                    entry_name=s.lower(), accession=None, name=s.lower(),
                    sequence=pdb_data_dict['wt_seq'], family=protein.family,
                    parent=protein,
                    residue_numbering_scheme=protein.residue_numbering_scheme,
                    sequence_type=ProteinSequenceType.objects.get(slug='mod'),
                    source=ProteinSource.objects.get(name='OTHER'),
                    species=protein.species)
                new_prot_conf, _ = ProteinConformation.objects.get_or_create(
                    protein=new_prot, state=state, template_structure=None)
                # PDB records are fixed-column: the prefixes must be
                # column-exact for the slices below (date in columns 14-22,
                # JRNL payload from column 20).
                for line in lines:
                    if line.startswith('REVDAT   1'):
                        publication_date = line[13:22]
                    if line.startswith('JRNL        PMID'):
                        pubmed = line[19:].strip()
                    if line.startswith('JRNL        DOI'):
                        doi = line[19:].strip()
                    pdb_file += line
                pdb_data, _ = PdbData.objects.get_or_create(pdb=pdb_file)
                d = datetime.strptime(publication_date, '%d-%b-%y')
                publication_date = d.strftime('%Y-%m-%d')

                # Best effort: a failing publication lookup must not stop
                # the import of the structure itself.
                try:
                    if doi != '':
                        try:
                            publication = Publication.objects.get(web_link__index=doi)
                        except Publication.DoesNotExist:
                            p = Publication()
                            try:
                                p.web_link = WebLink.objects.get(
                                    index=doi, web_resource__slug='doi')
                            except WebLink.DoesNotExist:
                                wl = WebLink.objects.create(
                                    index=doi,
                                    web_resource=WebResource.objects.get(slug='doi'))
                                p.web_link = wl
                            p.update_from_doi(doi=doi)
                            p.save()
                            publication = p
                    elif pubmed != '':
                        try:
                            publication = Publication.objects.get(web_link__index=pubmed)
                        except Publication.DoesNotExist:
                            p = Publication()
                            try:
                                p.web_link = WebLink.objects.get(
                                    index=pubmed, web_resource__slug='pubmed')
                            except WebLink.DoesNotExist:
                                wl = WebLink.objects.create(
                                    index=pubmed,
                                    web_resource=WebResource.objects.get(slug='pubmed'))
                                p.web_link = wl
                            p.update_from_pubmed_data(index=pubmed)
                            p.save()
                            publication = p
                except Exception:
                    pass

                pcs = PdbChainSelector(s, protein)
                pcs.run_dssp()
                preferred_chain = pcs.select_chain()
                os.remove('./pdb{}.ent'.format(s).lower())

                # Create new structure object
                Structure.objects.get_or_create(
                    preferred_chain=preferred_chain,
                    resolution=resolution,
                    publication_date=publication_date,
                    representative='f',
                    pdb_code=pdb_code,
                    pdb_data=pdb_data,
                    protein_conformation=new_prot_conf,
                    publication=publication,
                    state=state,
                    structure_type=st_type)
                print('{} added to db (preferred_chain chain: {})'.format(s, preferred_chain))
        except Exception as msg:
            print(msg)
def create_mutant_data(self, filenames):
    """Parse mutation data files and store each row as a ``MutationExperiment``.

    Files are read from ``self.structure_data_dir``. Excel files (``.xls`` /
    ``.xlsx``) are loaded with ``self.loaddatafromexcel`` and
    ``self.analyse_rows``; ``.yaml`` files are converted to the same row
    layout with empty experiment fields. Other files are skipped.

    For every row the publication, ligand, reference ligand, protein and
    residue are looked up (and created where the code below does so); rows
    whose publication cannot be saved or whose protein / residue is unknown
    are skipped and logged.

    Args:
        filenames: Names of the files to parse. When empty, every entry of
            ``self.structure_data_dir`` is considered.

    Returns:
        None.
    """
    self.logger.info('CREATING MUTANT DATA')

    # what files should be parsed?
    if not filenames:
        filenames = os.listdir(self.structure_data_dir)

    missing_proteins = {}
    mutants_for_proteins = {}

    # Experiment types whose values are treated as -log values.
    logtypes = ['pEC50', 'pIC50', 'pK']
    log_type_pattern = re.compile("(" + ")|(".join(logtypes) + ")")

    for source_file in filenames:
        source_file_path = os.sep.join([self.structure_data_dir, source_file])
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue
        self.logger.info('Reading file {}'.format(source_file_path))

        # read the source file
        if source_file.endswith(('xlsx', 'xls')):
            rows = self.loaddatafromexcel(source_file_path)
            rows = self.analyse_rows(rows)
        elif source_file.endswith('yaml'):
            # safe_load: the file is plain data, and yaml.load() without an
            # explicit Loader is rejected by current PyYAML releases.
            with open(source_file_path, 'r') as yaml_file:
                yaml_rows = yaml.safe_load(yaml_file)
            rows = []
            for entry in yaml_rows:
                d = {
                    'reference': entry['pubmed'],
                    'protein': entry['entry_name'].replace("__", "_").lower(),
                    'mutation_pos': entry['seq'],
                    'mutation_from': entry['from_res'],
                    'mutation_to': entry['to_res'],
                    'ligand_name': '',
                    'ligand_type': '',
                    'ligand_id': '',
                    'ligand_class': '',
                    'exp_type': '',
                    'exp_func': '',
                    'exp_wt_value': 0,
                    'exp_wt_unit': '',
                    'exp_mu_effect_sign': '',
                    'exp_mu_value_raw': 0,
                    'fold_effect': 0,
                    'exp_mu_effect_qual': '',
                    'exp_mu_effect_ligand_prop': '',
                    'exp_mu_ligand_ref': '',
                    'opt_type': '',
                    'opt_wt': 0,
                    'opt_mu': 0,
                    'opt_sign': '',
                    'opt_percentage': 0,
                    'opt_qual': '',
                    'opt_agonist': '',
                }
                # if something is off with amino acid
                if len(d['mutation_to']) > 1 or len(d['mutation_from']) > 1:
                    continue
                rows.append(d)
        else:
            self.logger.info('unknown format {}'.format(source_file))
            continue

        c = 0
        skipped = 0
        for r in rows:
            c += 1
            if c % 1000 == 0:
                self.logger.info('Parsed ' + str(c) + ' mutant data entries')

            # publication
            try:  # fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
            except ValueError:
                pass

            if r['reference'].isdigit():  # assume pubmed
                pub_type = 'pubmed'
            else:  # assume doi
                pub_type = 'doi'

            try:
                pub = Publication.objects.get(
                    web_link__index=r['reference'],
                    web_link__web_resource__slug=pub_type)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = WebLink.objects.get(
                        index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    wl = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(slug=pub_type))
                    pub.web_link = wl

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except Exception:
                    self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                    continue  # if something off with publication, skip.

            # ligand
            if r['ligand_type'] == 'PubChem CID' or r['ligand_type'] == 'SMILES':
                if r['ligand_type'] == 'PubChem CID':
                    pubchem_lookup_value = 'cid'
                else:
                    pubchem_lookup_value = 'smiles'

                try:
                    web_resource = WebResource.objects.get(slug='pubchem')
                except Exception as err:
                    # abort if pubchem resource is not found
                    raise Exception('PubChem resource not found, aborting!') from err

                if 'ligand_name' in r and r['ligand_name']:
                    ligand_name = str(r['ligand_name'])
                else:
                    ligand_name = False

                try:
                    # if this name is canonical and it has a ligand record already
                    ligand = Ligand.objects.get(
                        name=ligand_name, canonical=True,
                        properities__web_links__web_resource=web_resource,
                        properities__web_links__index=r['ligand_id'])
                except Ligand.DoesNotExist:
                    try:
                        # if exists under different name
                        l_canonical = Ligand.objects.get(
                            properities__web_links__web_resource=web_resource,
                            properities__web_links__index=r['ligand_id'],
                            canonical=True)
                        ligand, created = Ligand.objects.get_or_create(
                            properities=l_canonical.properities,
                            name=ligand_name, canonical=False)
                        if created:
                            self.logger.info('Created ligand {}'.format(ligand.name))
                    except Ligand.DoesNotExist:
                        # fetch ligand from pubchem
                        default_ligand_type = 'Small molecule'
                        lt, created = LigandType.objects.get_or_create(
                            slug=slugify(default_ligand_type),
                            defaults={'name': default_ligand_type})
                        ligand = Ligand()
                        ligand = ligand.load_from_pubchem(
                            pubchem_lookup_value, r['ligand_id'], lt, ligand_name)
            elif r['ligand_name']:
                # if this name is canonical and it has a ligand record already
                if Ligand.objects.filter(name=r['ligand_name'], canonical=True).exists():
                    ligand = Ligand.objects.get(name=r['ligand_name'], canonical=True)
                # if this matches an alias that only has "one" parent canonical name - eg distinct
                elif Ligand.objects.filter(name=r['ligand_name'], canonical=False,
                                           ambigious_alias=False).exists():
                    ligand = Ligand.objects.get(name=r['ligand_name'], canonical=False,
                                                ambigious_alias=False)
                # if this matches an alias that has several canonical parents, must
                # investigate, start with empty.
                elif Ligand.objects.filter(name=r['ligand_name'], canonical=False,
                                           ambigious_alias=True).exists():
                    lp = LigandProperities()
                    lp.save()
                    ligand = Ligand()
                    ligand.properities = lp
                    ligand.name = r['ligand_name']
                    ligand.canonical = False
                    ligand.ambigious_alias = True
                    ligand.save()
                    ligand.load_by_name(r['ligand_name'])
                # if neither a canonical or alias exists, create the records.
                # Remember to check for canonical / alias status.
                else:
                    lp = LigandProperities()
                    lp.save()
                    ligand = Ligand()
                    ligand.properities = lp
                    ligand.name = str(r['ligand_name'])
                    ligand.canonical = True
                    ligand.ambigious_alias = False
                    ligand.save()
                    ligand.load_by_name(str(r['ligand_name']))
            else:
                ligand = None

            # reference ligand
            ref_name = r['exp_mu_ligand_ref']
            if Ligand.objects.filter(name=ref_name, canonical=True).exists():
                # if this name is canonical and it has a ligand record already
                ligand_ref = Ligand.objects.get(name=ref_name, canonical=True)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=False).exists():
                # if this matches an alias that only has "one" parent canonical name - eg distinct
                ligand_ref = Ligand.objects.get(name=ref_name, canonical=False,
                                                ambigious_alias=False)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=True).exists():
                # if this matches an alias that has several canonical parents, must
                # investigate, start with empty.
                lp = LigandProperities()
                lp.save()
                ligand_ref = Ligand()
                ligand_ref.properities = lp
                ligand_ref.name = ref_name
                ligand_ref.canonical = False
                ligand_ref.ambigious_alias = True
                ligand_ref.save()
                ligand_ref.load_by_name(ref_name)
                ligand_ref.save()
            elif ref_name:
                # if neither a canonical or alias exists, create the records.
                # Remember to check for canonical / alias status.
                lp = LigandProperities()
                lp.save()
                ligand_ref = Ligand()
                ligand_ref.properities = lp
                ligand_ref.name = ref_name
                ligand_ref.canonical = True
                ligand_ref.ambigious_alias = False
                ligand_ref.save()
                ligand_ref.load_by_name(ref_name)
                ligand_ref.save()
            else:
                ligand_ref = None

            # protein
            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
                mutants_for_proteins[r['protein']] = mutants_for_proteins.get(r['protein'], 0) + 1
            else:
                skipped += 1
                missing_proteins[r['protein']] = missing_proteins.get(r['protein'], 0) + 1
                self.logger.error('Skipped due to no protein ' + r['protein'])
                continue

            # residue
            res = Residue.objects.filter(protein_conformation__protein=protein,
                                         sequence_number=r['mutation_pos'])
            if res.exists():
                res = res.get()
            else:
                self.logger.error('Skipped due to no residue ' + r['protein'] + ' pos:'
                                  + str(r['mutation_pos']))
                skipped += 1
                continue

            if r['ligand_class']:
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]})  # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                    or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']):
                exp_opt_id, created = MutationOptional.objects.get_or_create(
                    type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'],
                    sign=r['opt_sign'], percentage=r['opt_percentage'],
                    qual=r['opt_qual'], agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)

            # fold change; values below 1 are stored as the negative inverse
            foldchange = 0
            if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:  # fix for new format
                if log_type_pattern.match(r['exp_type']):  # -log values!
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw']) / pow(10, -r['exp_wt_value']), 3)
                else:
                    foldchange = round(r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                # a tiny fold_effect rounds to 0: do not divide by it
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)

            raw_experiment = self.insert_raw(r)
            MutationExperiment.objects.get_or_create(
                refs=pub,
                protein=protein,
                residue=res,
                ligand=ligand,
                ligand_role=l_role,
                ligand_ref=ligand_ref,
                raw=raw_experiment,
                optional=exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],
                # wt_unit=r['exp_wt_unit'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange)

        self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' + str(skipped))

    # Per-protein counters sorted by frequency; computed but not reported.
    sorted_missing_proteins = sorted(missing_proteins.items(),
                                     key=operator.itemgetter(1), reverse=True)
    sorted_mutants_for_proteins = sorted(mutants_for_proteins.items(),
                                         key=operator.itemgetter(1), reverse=True)

    self.logger.info('COMPLETED CREATING MUTANTS')
def main_func(self, positions, iteration):
    """Parse one slice of ``self.data`` and bulk-insert the mutation experiments.

    For every row the publication, review publication, ligand, reference
    ligand, protein and residue are resolved (using
    ``self.publication_cache``, ``self.ligand_cache`` and
    ``self.ref_ligand_cache`` to avoid repeated lookups). Rows whose
    publication cannot be saved or whose protein / residue is unknown are
    skipped and logged. The collected ``MutationRaw`` and
    ``MutationExperiment`` objects are written with ``bulk_create`` after the
    loop.

    Args:
        positions: Pair ``(start, end)`` of indexes into ``self.data``; a
            falsy ``end`` means "until the end of the data".
        iteration: Not used by this method.

    Returns:
        None.
    """
    # rows to process
    if not positions[1]:
        rows = self.data[positions[0]:]
    else:
        rows = self.data[positions[0]:positions[1]]

    missing_proteins = {}
    mutants_for_proteins = {}

    c = 0
    skipped = 0
    bulk_m = []
    bulk_r = []
    current_sheet = time.time()

    # Experiment types whose values are treated as -log values.
    logtypes = ['pEC50', 'pIC50', 'pK']
    log_type_pattern = re.compile("(" + ")|(".join(logtypes) + ")")

    for r in rows:
        c += 1

        # publication
        # Fix if it thinks it's float. Each field is normalised on its own so
        # that a DOI reference does not prevent a numeric review id from
        # being converted.
        for field in ('reference', 'review'):
            try:
                float(r[field])
                r[field] = str(int(r[field]))
            except ValueError:
                pass

        if r['reference'].isdigit():  # assume pubmed
            pub_type = 'pubmed'
        else:  # assume doi
            pub_type = 'doi'

        if r['reference'] not in self.publication_cache:
            try:
                pub = Publication.objects.get(web_link__index=r['reference'],
                                              web_link__web_resource__slug=pub_type)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = WebLink.objects.get(index=r['reference'],
                                                       web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    wl = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(slug=pub_type))
                    pub.web_link = wl

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except Exception:
                    self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                    continue  # if something off with publication, skip.
            self.publication_cache[r['reference']] = pub
        else:
            pub = self.publication_cache[r['reference']]

        # review publication
        if r['review'].isdigit():  # assume pubmed
            pub_type = 'pubmed'
        else:  # assume doi
            pub_type = 'doi'

        if r['review']:
            if r['review'] not in self.publication_cache:
                try:
                    pub_review = Publication.objects.get(web_link__index=r['review'],
                                                         web_link__web_resource__slug=pub_type)
                except Publication.DoesNotExist:
                    pub_review = Publication()
                    try:
                        pub_review.web_link = WebLink.objects.get(index=r['review'],
                                                                  web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        wl = WebLink.objects.create(
                            index=r['review'],
                            web_resource=WebResource.objects.get(slug=pub_type))
                        pub_review.web_link = wl

                    if pub_type == 'doi':
                        pub_review.update_from_doi(doi=r['review'])
                    elif pub_type == 'pubmed':
                        pub_review.update_from_pubmed_data(index=r['review'])
                    try:
                        pub_review.save()
                    except Exception:
                        self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type)
                        continue  # if something off with publication, skip.
                self.publication_cache[r['review']] = pub_review
            else:
                pub_review = self.publication_cache[r['review']]
        else:
            pub_review = None

        # ligand, cached per name and id
        ligand = None
        ligand_key = str(r['ligand_name'])
        if ligand_key in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[ligand_key]:
                ligand = self.ligand_cache[ligand_key][r['ligand_id']]
        else:
            self.ligand_cache[ligand_key] = {}

        if not ligand:
            ligand = get_or_make_ligand(r['ligand_id'], r['ligand_type'], ligand_key)
            self.ligand_cache[ligand_key][r['ligand_id']] = ligand

        # reference ligand, cached per name
        ligand_ref = None
        ref_name = r['exp_mu_ligand_ref']
        if str(ref_name) in self.ref_ligand_cache:
            ligand_ref = self.ref_ligand_cache[str(ref_name)]
        else:
            if Ligand.objects.filter(name=ref_name, canonical=True).exists():
                # if this name is canonical and it has a ligand record already
                ligand_ref = Ligand.objects.get(name=ref_name, canonical=True)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=False).exists():
                # if this matches an alias that only has "one" parent canonical name - eg distinct
                ligand_ref = Ligand.objects.get(name=ref_name, canonical=False,
                                                ambigious_alias=False)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=True).exists():
                # if this matches an alias that has several canonical parents, must
                # investigate, start with empty.
                lp = LigandProperities()
                lp.save()
                ligand_ref = Ligand()
                ligand_ref.properities = lp
                ligand_ref.name = ref_name
                ligand_ref.canonical = False
                ligand_ref.ambigious_alias = True
                ligand_ref.save()
                ligand_ref.load_by_name(ref_name)
                ligand_ref.save()
            elif Ligand.objects.filter(name=ref_name, canonical=False).exists():
                # ambigious_alias not specified
                ligand_ref = Ligand.objects.get(name=ref_name, canonical=False)
                ligand_ref.ambigious_alias = False
                ligand_ref.save()
            elif ref_name:
                # if neither a canonical or alias exists, create the records.
                # Remember to check for canonical / alias status.
                lp = LigandProperities()
                lp.save()
                ligand_ref = Ligand()
                ligand_ref.properities = lp
                ligand_ref.name = ref_name
                ligand_ref.canonical = True
                ligand_ref.ambigious_alias = False
                ligand_ref.save()
                ligand_ref.load_by_name(ref_name)
                try:
                    ligand_ref.save()
                except IntegrityError:
                    ligand_ref = Ligand.objects.get(name=ref_name, canonical=False)
                    print("error failing ligand, duplicate?")
            else:
                ligand_ref = None
            self.ref_ligand_cache[str(ref_name)] = ligand_ref

        # protein
        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
            mutants_for_proteins[r['protein']] = mutants_for_proteins.get(r['protein'], 0) + 1
        else:
            skipped += 1
            missing_proteins[r['protein']] = missing_proteins.get(r['protein'], 0) + 1
            self.logger.error('Skipped due to no protein ' + r['protein'])
            continue

        # residue; the wild-type amino acid has to match as well
        res = Residue.objects.filter(protein_conformation__protein=protein,
                                     amino_acid=r['mutation_from'],
                                     sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein']
                              + ' pos:' + str(r['mutation_pos']) + ' AA:' + r['mutation_from'])
            skipped += 1
            continue

        if r['ligand_class']:
            l_role, created = LigandRole.objects.get_or_create(
                name=r['ligand_class'],
                defaults={'slug': slugify(r['ligand_class'])[:50]})  # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, created = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']):
            exp_opt_id, created = MutationOptional.objects.get_or_create(
                type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'],
                sign=r['opt_sign'], percentage=r['opt_percentage'],
                qual=r['opt_qual'], agonist=r['opt_agonist'])
        else:
            exp_opt_id = None

        try:
            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)
        except IntegrityError:
            mutation = Mutation.objects.get(
                amino_acid=r['mutation_to'], protein=protein, residue=res)

        # fold change; values below 1 are stored as the negative inverse
        foldchange = 0
        if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:  # fix for new format
            if log_type_pattern.match(r['exp_type']):  # -log values!
                foldchange = round(
                    math.pow(10, -r['exp_mu_value_raw']) / pow(10, -r['exp_wt_value']), 3)
            else:
                foldchange = round(r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            # a tiny fold_effect rounds to 0: do not divide by it
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)

        raw_experiment = self.insert_raw(r)
        # The raw record is attached after MutationRaw.bulk_create below.
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            protein=protein,
            residue=res,
            ligand=ligand,
            ligand_role=l_role,
            ligand_ref=ligand_ref,
            optional=exp_opt_id,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'],
            # wt_unit=r['exp_wt_unit'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange
        )
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)

    self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' + str(skipped))

    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    for i, me in enumerate(bulk_m):
        me.raw = raws[i]
    MutationExperiment.objects.bulk_create(bulk_m)
    end = time.time()
    diff = round(end - current, 2)
    diff_2 = round(end - current_sheet, 2)
    print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped", str(skipped))
def main_func(self, positions, iteration, count, lock):
    """Convert the rows of ``self.data_all`` into mutation experiment records.

    Rows are claimed one at a time through the shared ``count`` counter, which
    is read and advanced while holding ``lock``. For every claimed row the
    publication, review, ligand, reference ligand, protein, residue, ligand
    role, experiment descriptors and mutation are looked up or created, and a
    fold change is derived. Rows whose publication, ligand, protein or residue
    cannot be resolved are skipped. The collected ``MutationRaw`` and
    ``MutationExperiment`` objects are written with ``bulk_create`` once the
    loop has finished.

    Parameters:
        positions: not used by this method; kept for the caller's signature.
        iteration: not used by this method; kept for the caller's signature.
        count: object exposing an integer ``value`` attribute holding the
            index of the next unclaimed row (presumably a
            ``multiprocessing.Value`` - confirm against the caller).
        lock: context manager guarding ``count``.
    """
    missing_proteins = {}       # protein name -> number of rows skipped for it
    mutants_for_proteins = {}   # protein name -> number of rows matched to it
    wrong_uniport_ids = {}      # name as given in the data -> resolved entry_name
    c = 0
    skipped = 0
    inserted = 0
    bulk_m = []
    bulk_r = []
    current_sheet = time.time()

    # Experiment types treated as -log10 values; matched as a prefix of
    # r['exp_type']. Compiled once because the pattern never changes.
    logtypes = ['pEC50', 'pIC50', 'pK']
    log_type_pattern = re.compile("(" + ")|(".join(logtypes) + ")")

    rows = self.data_all
    while True:
        # The bounds check must happen under the lock: otherwise two workers
        # can both pass it for the last row and one indexes past the end.
        with lock:
            if count.value >= len(rows):
                break
            r = rows[count.value]
            count.value += 1
        c += 1

        # Identifiers that were read as floats (e.g. 12345.0) become '12345'.
        # Each field is handled on its own so that a DOI reference does not
        # prevent the review from being normalised.
        for key in ('reference', 'review'):
            try:
                float(r[key])
                r[key] = str(int(r[key]))
            except ValueError:
                pass

        # --- publication -------------------------------------------------
        # All-digit references are assumed to be PubMed ids, the rest DOIs.
        pub_type = 'pubmed' if r['reference'].isdigit() else 'doi'
        pub = self._get_or_create_publication(r['reference'], pub_type, 'reference')
        if pub is None:
            continue  # if something off with publication, skip.

        # --- review ------------------------------------------------------
        if r['review']:
            if r['review'].isdigit():
                review_type = 'pubmed'
            elif r['review'].startswith('http'):
                review_type = 'raw_link'
            else:
                review_type = 'doi'
            pub_review = self._get_or_create_publication(r['review'], review_type, 'review')
            if pub_review is None:
                continue  # if something off with publication, skip.
        else:
            pub_review = None

        # --- ligand ------------------------------------------------------
        # self.ligand_cache is keyed by ligand name, then by ligand id.
        ligand_name = str(r['ligand_name'])
        ligands_by_id = self.ligand_cache.setdefault(ligand_name, {})
        ligand = ligands_by_id.get(r['ligand_id'])
        if not ligand:
            try:
                ligand = get_or_make_ligand(r['ligand_id'], r['ligand_type'], ligand_name)
            except Exception as msg:
                print('Something errored with ligand, aborting entry of mutation', r['ligand_name'], r['ligand_type'], r['ligand_id'], r['source_file'])
                print(msg)
                traceback.print_exc()
                continue
            ligands_by_id[r['ligand_id']] = ligand

        # --- reference ligand --------------------------------------------
        l_ref = self._resolve_reference_ligand(r['exp_mu_ligand_ref'])

        # --- protein -----------------------------------------------------
        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
            mutants_for_proteins[r['protein']] = mutants_for_proteins.get(r['protein'], 0) + 1
        elif r['protein'] not in missing_proteins:
            try:
                # Name already resolved for an earlier row. The table is
                # indexed once, with the name as given in the data, before
                # r['protein'] is rewritten.
                real_uniprot = wrong_uniport_ids[r['protein']]
                protein = Protein.objects.get(entry_name=real_uniprot)
                r['protein'] = real_uniprot
            except Exception:
                # Not in the lookup table: look for it as a uniprot web link.
                protein = Protein.objects.filter(web_links__web_resource__slug='uniprot', web_links__index=r['protein'].upper())
                if protein.exists():
                    protein = protein.get()
                    real_uniprot = protein.entry_name
                    mutants_for_proteins[r['protein']] = mutants_for_proteins.get(r['protein'], 0) + 1
                else:
                    # Try to lookup in uniprot to catch typing errors / variants in entry_name
                    url = 'http://www.uniprot.org/uniprot/$index.xml'
                    cache_dir = ['uniprot', 'id']
                    uniprot_protein = fetch_from_web_api(url, r['protein'], cache_dir, xml=True)
                    try:
                        real_uniprot = uniprot_protein.find('.//{http://uniprot.org/uniprot}name').text.lower()
                        protein = Protein.objects.get(entry_name=real_uniprot)
                    except Exception:
                        skipped += 1
                        missing_proteins[r['protein']] = missing_proteins.get(r['protein'], 0) + 1
                        self.logger.error('Skipped due to no protein ' + r['protein'])
                        continue
                # Remember the mapping, then continue with the resolved name.
                wrong_uniport_ids[r['protein']] = protein.entry_name
                r['protein'] = real_uniprot
        else:
            # Already known to be missing: count the row and move on.
            # NOTE(review): 'skipped' is not incremented on this path - confirm
            # whether that is intended.
            missing_proteins[r['protein']] += 1
            continue

        # --- residue -----------------------------------------------------
        res = Residue.objects.filter(protein_conformation__protein=protein, amino_acid=r['mutation_from'], sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:' + str(r['mutation_pos']) + ' AA:' + r['mutation_from'])
            skipped += 1
            continue

        # --- ligand role -------------------------------------------------
        if r['ligand_class']:
            role_slug = slugify(r['ligand_class'])[:50]
            try:
                l_role, _ = LigandRole.objects.get_or_create(name=r['ligand_class'], defaults={'slug': role_slug})  # FIXME this should not be needed
            except Exception as e:
                if LigandRole.objects.filter(slug=role_slug).exists():
                    l_role = LigandRole.objects.get(slug=role_slug)
                    if l_role.name == role_slug:
                        # if name of role is same as slug, then it was created by constructs script, replace it
                        l_role.name = r['ligand_class']
                        l_role.save()
                else:
                    print(e)
                    print("Error with", r['ligand_class'], role_slug)
                    l_role, _ = LigandRole.objects.get_or_create(slug=role_slug)  # FIXME this should not be needed
        else:
            l_role = None

        # --- experiment descriptors --------------------------------------
        if r['exp_type']:
            exp_type_id, _ = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, _ = MutationFunc.objects.get_or_create(func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, _ = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        # --- mutation ----------------------------------------------------
        try:
            mutation, _ = Mutation.objects.get_or_create(amino_acid=r['mutation_to'], protein=protein, residue=res)
        except IntegrityError:
            # Fall back to a plain lookup if the create collided.
            mutation = Mutation.objects.get(amino_acid=r['mutation_to'], protein=protein, residue=res)

        # --- fold change -------------------------------------------------
        foldchange = 0
        if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:  # fix for new format
            if log_type_pattern.match(r['exp_type']):
                # -log values!
                try:
                    foldchange = round(math.pow(10, -r['exp_mu_value_raw']) / pow(10, -r['exp_wt_value']), 3)
                except Exception:
                    # Best effort: dump the row and keep foldchange at 0.
                    print(r)
            elif "%" == r['exp_wt_unit']:
                # if % then it's a difference case, then lower value is bad.
                # Otherwise it's conc and lower is better
                foldchange = round(r['exp_wt_value'] / r['exp_mu_value_raw'], 3)
            else:
                foldchange = round(r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
            # Ratios between 0 and 1 are stored as the negated reciprocal.
            if 0 < foldchange < 1:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            # Same window as above: a value that rounds to 0 must not be
            # divided by, and an already negative value must not be inverted.
            if 0 < foldchange < 1:
                foldchange = -round((1 / foldchange), 3)
        # Written back before insert_raw(r) so the raw record sees this value.
        r['fold_effect'] = foldchange

        raw_experiment = self.insert_raw(r)
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            submitting_group=r['submitting_group'],
            data_container=r['data_container'],
            data_container_number=r['data_container_number'],
            protein=protein,
            residue=res,
            ligand=ligand,
            ligand_role=l_role,
            ligand_ref=l_ref,
            # 'raw' is attached after MutationRaw.objects.bulk_create below.
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange,
            opt_receptor_expression=r['opt_receptor_expression'],
            opt_basal_activity=r['opt_basal_activity'],
            opt_gain_of_activity=r['opt_gain_of_activity'],
            opt_ligand_emax=r['opt_ligand_emax'],
            opt_agonist=r['opt_agonist'],
        )
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)
        inserted += 1

    self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' + str(skipped))

    # Write the raw rows first, then link each experiment to the raw object
    # returned at the same position before writing the experiments.
    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    for experiment, raw in zip(bulk_m, raws):
        experiment.raw = raw
    MutationExperiment.objects.bulk_create(bulk_m)
    end = time.time()
    diff = round(end - current, 2)
    diff_2 = round(end - current_sheet, 2)
    print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped", str(skipped))

    sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1), reverse=True)


def _get_or_create_publication(self, index, pub_type, label):
    """Return the Publication for ``index``, creating it when necessary.

    Results are memoised in ``self.publication_cache`` keyed by ``index``.

    Parameters:
        index: publication identifier used as the WebLink index.
        pub_type: WebResource slug ('pubmed', 'doi' or 'raw_link').
        label: word placed in the error log message ('reference' or 'review').

    Returns:
        The Publication, or None when a newly created publication could not
        be saved; the failure is logged and nothing is cached in that case.
    """
    if index in self.publication_cache:
        return self.publication_cache[index]

    try:
        wl = WebLink.objects.get(index=index, web_resource__slug=pub_type)
    except WebLink.DoesNotExist:
        try:
            wl = WebLink.objects.create(index=index, web_resource=WebResource.objects.get(slug=pub_type))
        except IntegrityError:
            # The create collided; fetch the existing row instead.
            wl = WebLink.objects.get(index=index, web_resource__slug=pub_type)

    try:
        publication = Publication.objects.get(web_link=wl)
    except Publication.DoesNotExist:
        publication = Publication()
        try:
            publication.web_link = wl
            publication.save()
        except IntegrityError:
            publication = Publication.objects.get(web_link=wl)

        # 'raw_link' publications have no metadata update step.
        if pub_type == 'doi':
            publication.update_from_doi(doi=index)
        elif pub_type == 'pubmed':
            publication.update_from_pubmed_data(index=index)
        try:
            publication.save()
        except Exception:
            self.logger.error('error with ' + label + ' ' + str(index) + ' ' + pub_type)
            return None

    self.publication_cache[index] = publication
    return publication


def _resolve_reference_ligand(self, name):
    """Return the Ligand named ``name`` for use as reference ligand, or None.

    Results (including None) are memoised in ``self.ref_ligand_cache`` keyed
    by ``str(name)``. Existing records are preferred in this order: canonical,
    unambiguous alias, ambiguous alias, alias without the flag set. When no
    record exists and ``name`` is truthy a new canonical ligand is created.
    """
    cache_key = str(name)
    if cache_key in self.ref_ligand_cache:
        return self.ref_ligand_cache[cache_key]

    if Ligand.objects.filter(name=name, canonical=True).exists():
        # this name is canonical and it has a ligand record already
        l_ref = Ligand.objects.get(name=name, canonical=True)
    elif Ligand.objects.filter(name=name, canonical=False, ambigious_alias=False).exists():
        # this matches an alias that only has "one" parent canonical name - eg distinct
        l_ref = Ligand.objects.get(name=name, canonical=False, ambigious_alias=False)
    elif Ligand.objects.filter(name=name, canonical=False, ambigious_alias=True).exists():
        # this matches an alias that has several canonical parents,
        # must investigate, start with empty.
        lp = LigandProperities()
        lp.save()
        l_ref = Ligand()
        l_ref.properities = lp
        l_ref.name = name
        l_ref.canonical = False
        l_ref.ambigious_alias = True
        l_ref.save()
        l_ref.load_by_name(name)
        l_ref.save()
    elif Ligand.objects.filter(name=name, canonical=False).exists():
        # ambigious_alias not specified
        l_ref = Ligand.objects.get(name=name, canonical=False)
        l_ref.ambigious_alias = False
        l_ref.save()
    elif name:
        # neither a canonical nor an alias exists, create the records.
        # Remember to check for canonical / alias status.
        lp = LigandProperities()
        lp.save()
        l_ref = Ligand()
        l_ref.properities = lp
        l_ref.name = name
        l_ref.canonical = True
        l_ref.ambigious_alias = False
        try:
            l_ref.save()
            l_ref.load_by_name(name)
        except IntegrityError:
            # Saving collided with an existing record; use that one.
            if Ligand.objects.filter(name=name, canonical=True).exists():
                l_ref = Ligand.objects.get(name=name, canonical=True)
            else:
                l_ref = Ligand.objects.get(name=name, canonical=False)
        try:
            l_ref.save()
        except IntegrityError:
            l_ref = Ligand.objects.get(name=name, canonical=False)
    else:
        l_ref = None

    self.ref_ligand_cache[cache_key] = l_ref
    return l_ref