def fetch_ligand(self, ligand_id, smiles):
    """Return the ligand registered under ``ligand_id``, or ``None``.

    Lookup order:

    1. ``self.ligand_cache[ligand_id]`` when the id is already cached.
    2. The first ``Ligand`` owning a web link whose index equals
       ``ligand_id``. Such a ligand is only accepted when it also carries a
       'pubchem' web link; otherwise ``None`` is returned.
    3. When no ligand owns such a web link, ``get_or_make_ligand`` is called
       with ``smiles`` (type ``'SMILES'``) and ``ligand_id`` as the name.

    This method never writes to ``self.ligand_cache``.

    :param ligand_id: identifier used as cache key, web-link index and
        fallback ligand name.
    :param smiles: SMILES string handed to ``get_or_make_ligand``.
    :return: the ligand, or ``None`` when nothing usable was found or any
        exception was raised along the way (errors are swallowed).
    """
    l = None
    try:
        if ligand_id in self.ligand_cache:
            l = self.ligand_cache[ligand_id]
        else:
            l = Ligand.objects.filter(
                properities__web_links__index=ligand_id).first()
            if l:
                # Only the existence of a PubChem link matters here; its
                # index value is not needed.
                pubchem_link = l.properities.web_links.filter(
                    web_resource__slug='pubchem').first()
                if not pubchem_link:
                    l = None
            else:
                l = get_or_make_ligand(smiles, 'SMILES', ligand_id)
    except Exception:
        # Best effort: any lookup/creation failure yields "no ligand".
        l = None
    return l
def fetch_ligand(self, ligand_id, ligand_type, ligand_name, source_file):
    """Return the ligand for ``ligand_id``, memoised in ``self.ligand_cache``.

    On a cache miss the ligand is obtained from
    ``get_or_make_ligand(ligand_id, ligand_type, ligand_name)`` and stored in
    the cache under ``ligand_id`` before being returned.

    :param ligand_id: identifier of the ligand; also the cache key.
    :param ligand_type: identifier type forwarded to ``get_or_make_ligand``.
    :param ligand_name: ligand name forwarded to ``get_or_make_ligand``.
    :param source_file: accepted for interface compatibility; not used.
    :return: the ligand, or ``None`` if anything raised (errors are swallowed
        and nothing is cached in that case).
    """
    try:
        if ligand_id not in self.ligand_cache:
            self.ligand_cache[ligand_id] = get_or_make_ligand(
                ligand_id, ligand_type, ligand_name)
        ligand = self.ligand_cache[ligand_id]
    except Exception:
        ligand = None
    return ligand
def fetch_ligand(self, ligand_id, ligand_type, ligand_name, source_file):
    """Return the ligand for ``ligand_id`` using a two-level cache.

    ``self.ligand_cache`` maps ``str(ligand_id)`` to a dict that maps
    ``ligand_id`` to the ligand. On a miss the ligand comes from
    ``get_or_make_ligand(ligand_id, ligand_type, str(ligand_name))`` and is
    stored in that nested dict so the next call is served from the cache.

    :param ligand_id: identifier of the ligand; used for both cache levels.
    :param ligand_type: identifier type forwarded to ``get_or_make_ligand``.
    :param ligand_name: ligand name (stringified before the lookup).
    :param source_file: only used in the error log message.
    :return: the ligand, or ``None`` when the lookup raised. The failure is
        logged with traceback through ``self.mylog``.
    """
    per_id_cache = self.ligand_cache.setdefault(str(ligand_id), {})
    l = per_id_cache.get(ligand_id)

    if not l:
        try:
            l = get_or_make_ligand(
                ligand_id, ligand_type, str(ligand_name))
        except Exception:
            l = None
            # logging %-formats the message with the extra args, so every
            # argument needs its own placeholder; without them the record
            # fails to format and the message is lost.
            self.mylog.exception(
                "Protein fetching error | module: fetch_ligand. "
                "Row # is : %s %s %s %s",
                ligand_name, ligand_type, ligand_id, source_file)
        else:
            if l is not None:
                # Store under the same keys that are read above; failures
                # are not cached so they are retried on the next call.
                per_id_cache[ligand_id] = l
    return l
def fetch_ligand(self, ligand_id, ligand_type, ligand_name, source_file):
    """Return a ligand for ``ligand_id``, falling back to an empty ligand.

    Lookup order:

    1. ``self.ligand_cache[ligand_id]`` when cached; otherwise
       ``get_or_make_ligand(ligand_id, ligand_type, ligand_name)``, whose
       result is stored in the cache.
    2. If that yields ``None``, ``self.create_empty_ligand(ligand_name)``.
    3. If step 1 or 2 raised, the ligand owning a 'pubchem' web link with
       index ``ligand_id``; if that lookup fails too,
       ``self.create_empty_ligand(ligand_name)``.

    :param ligand_id: identifier of the ligand; also the cache key.
    :param ligand_type: identifier type forwarded to ``get_or_make_ligand``.
    :param ligand_name: name used for lookup and for the empty ligand.
    :param source_file: accepted for interface compatibility; not used.
    :return: whatever the first successful step produced.
    :raises: errors from fetching the 'pubchem' ``WebResource`` itself are
        not handled and propagate. ``KeyboardInterrupt``/``SystemExit`` are
        never swallowed.
    """
    try:
        if ligand_id in self.ligand_cache:
            l = self.ligand_cache[ligand_id]
        else:
            l = get_or_make_ligand(ligand_id, ligand_type, ligand_name)
            self.ligand_cache[ligand_id] = l
        if l is None:
            l = self.create_empty_ligand(ligand_name)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # interpreter shutdown are not turned into a database fallback.
        web_resource = WebResource.objects.get(slug='pubchem')
        try:
            l = Ligand.objects.get(
                properities__web_links__web_resource=web_resource,
                properities__web_links__index=ligand_id)
        except Exception:
            l = self.create_empty_ligand(ligand_name)
    return l
def fetch_ligand(self, ligand_id, smiles):
    """Return the ligand named ``ligand_id``, creating it from SMILES if needed.

    Lookup order:

    1. ``self.ligand_cache[ligand_id]`` when the id is already cached.
    2. The first ``Ligand`` whose ``name`` equals ``ligand_id``.
    3. ``get_or_make_ligand(smiles, 'SMILES', ligand_id)`` when no ligand
       with that name exists.

    The name lookup uses ``filter(...).first()``, which returns ``None`` for
    "no match". ``get()`` raises ``DoesNotExist`` instead, which made the
    SMILES fallback in step 3 unreachable. When several ligands share the
    name, the first one is returned.

    This method never writes to ``self.ligand_cache``.

    :param ligand_id: ligand name; also the cache key.
    :param smiles: SMILES string used to create the ligand when missing.
    :return: the ligand, or ``None`` if any step raised (errors are
        swallowed).
    """
    try:
        if ligand_id in self.ligand_cache:
            return self.ligand_cache[ligand_id]
        l = Ligand.objects.filter(name=ligand_id).first()
        if not l:
            l = get_or_make_ligand(smiles, 'SMILES', ligand_id)
    except Exception:
        # Best effort: any lookup/creation failure yields "no ligand".
        l = None
    return l
def main_func(self, positions, iteration,count,lock):
    """Run one pass of the ChEMBL load, selected by ``iteration``.

    ``iteration == 0`` walks ``self.chembl_mol_ids`` and makes sure every id
    has a ligand carrying both a 'pubchem' and a 'chembl_ligand' web link,
    then loads vendor records for ligands that have none.

    ``iteration == 1`` walks ``self.data`` and creates or updates one
    ``AssayExperiment`` per (protein, ligand, assay) row, then prints how
    many rows were skipped for lack of a matching ligand.

    Any other ``iteration`` value does nothing. ``positions`` is not used.
    ``count.value`` is the shared cursor into the work list; it is read and
    advanced while holding ``lock``.
    """
    # presumably `count`/`lock` are a multiprocessing Value/Lock pair shared
    # by several workers -- TODO confirm against the caller.
    #####Create chembl compound link and connect it to the corresponding ligand/cid#####
    if iteration==0:
        # First load makes sure ligands are there
        list_of_chembl_ids = self.chembl_mol_ids
        # NOTE(review): the bounds check below runs before the lock is taken;
        # if workers share `count`, the index read inside the lock could be
        # past the end of the list -- confirm.
        while count.value<len(list_of_chembl_ids):
            with lock:
                # Claim the next id and advance the shared cursor.
                chembl_ligand = list_of_chembl_ids[count.value]
                count.value +=1
                if count.value % 1000 == 0:
                    print('{} Status {} out of {}'.format(
                        datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'),
                        count.value, len(list_of_chembl_ids)))

            l = Ligand.objects.filter(properities__web_links__web_resource__slug = 'chembl_ligand', properities__web_links__index=chembl_ligand).first()
            if l:
                cid = l.properities.web_links.filter(web_resource__slug = 'pubchem').first()
                if cid:
                    # Keep only the PubChem index; it is needed for the vendor lookup.
                    cid = cid.index
                else:
                    l = None
                    # make sure the code below is run

            if not l:
                # if l already has chembl link, assume all is good.
                if chembl_ligand not in self.chembl_cid_dict.keys():
                    cids, not_found = self.find_cid_for_chembl(chembl_ligand)
                    if not_found:
                        print('SKIPPED: Could not determine CID',chembl_ligand,cids)
                        continue
                else:
                    cids = self.chembl_cid_dict[chembl_ligand]

                temp = str(cids).split(';') #perhaps we should load all of the CIDs
                cid = str(temp[0])
                l = get_or_make_ligand(cid,'PubChem CID') #call the first cid if there are more than one
                if not l:
                    print('SKIPPED: Ligand not found in PubChem', cid)
                    continue

                if not l.properities.web_links.filter(web_resource__slug = 'pubchem',index = cid).exists():
                    # NO CID FOR LIGAND! Rare cases where SMILES was used for initial look up
                    wl, created = WebLink.objects.get_or_create(index=cid, web_resource=self.wr_pubchem)
                    l.properities.web_links.add(wl)

                if not l.properities.web_links.filter(web_resource__slug = 'chembl_ligand',index = chembl_ligand).exists():
                    wl, created = WebLink.objects.get_or_create(index=chembl_ligand, web_resource=self.wr)
                    l.properities.web_links.add(wl)

            ###### Vendor stuff ######
            if not len(l.properities.vendors.all()):
                # If it has some, assume they are all loaded
                cache_dir = ['pubchem', 'cid', 'vendors']
                url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/categories/compound/$index/JSON/'
                vendors = fetch_from_web_api(url, cid, cache_dir)

                if vendors:
                    for vendor_data in vendors['SourceCategories']['Categories'][0]['Sources'] :
                        # Create/refresh the vendor itself, keyed by slugified name.
                        lv, created = LigandVendors.objects.get_or_create(slug = slugify(vendor_data['SourceName']))
                        lv.name = vendor_data['SourceName']
                        if 'SourceURL' in vendor_data:
                            lv.url = vendor_data['SourceURL']
                        lv.save()

                        if 'SID' in vendor_data:
                            #print (vendor_data['SID'])
                            lvls = LigandVendorLink.objects.filter(sid = vendor_data['SID'] )
                            if not lvls.exists():
                                lvl = LigandVendorLink()
                                lvl.vendor = lv
                                lvl.lp = l.properities
                                lvl.sid = vendor_data['SID']
                                if 'RegistryID' in vendor_data:
                                    lvl.vendor_external_id = vendor_data['RegistryID']
                                if 'SourceRecordURL' in vendor_data:
                                    lvl.url = vendor_data['SourceRecordURL']
                                else:
                                    # A link without a record URL is not stored.
                                    continue
                                lvl.save()

    elif iteration==1:
        # Third load loads the exp (based on ligand/assay)
        header = self.header_dict
        skipped = 0
        non_p = []  # targets already reported as missing, to print each only once
        wr_chembl_assays = WebResource.objects.get(slug='chembl_assays')
        # NOTE(review): same unlocked bounds check as in the first pass -- confirm.
        while count.value<len(self.data):
            with lock:
                # Claim the next row and advance the shared cursor.
                record = self.data[count.value]
                count.value +=1
                if count.value % 10000 == 0:
                    print('{} Status {} out of {}'.format(
                        datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'),
                        count.value, len(self.data)))

            target = record[header['target_chembl_id']]
            assay_id = record[header['assay_chembl_id']]

            assay, created = ChemblAssay.objects.get_or_create(assay_id=assay_id)
            if created:
                wl, created = WebLink.objects.get_or_create(index=assay_id, web_resource=wr_chembl_assays)
                assay.web_links.add(wl)

            ligand =record[header['molecule_chembl_id']]
            p = Protein.objects.filter(web_links__index = target, web_links__web_resource__slug = 'chembl').first()
            if not p:
                if not target in non_p:
                    non_p.append(target)
                    print('Not found protein!',target)
                continue

            ls = Ligand.objects.filter(properities__web_links__index=ligand, properities__web_links__web_resource__slug = 'chembl_ligand', canonical=True)
            if not ls.exists():
                # if no ligand matches this, then ignore -- be sure this works later.
                skipped += 1
                continue

            for l in ls:
                if len(ls)>1:
                    # More than one canonical ligand for one ChEMBL id: report and
                    # store nothing for this row.
                    print('issue with canonical! give to munk',l,l.pk,ligand)
                    break
                assay_experiments = AssayExperiment.objects.filter( protein=p, ligand=l, assay=assay)

                if assay_experiments.exists():
                    assay_experiment = assay_experiments.get()
                else:
                    assay_experiment = AssayExperiment()
                    assay_experiment.assay = assay
                    assay_experiment.ligand = l
                    assay_experiment.protein = p

                # Copy the measured values from the row onto the experiment.
                assay_experiment.assay_type = record[header['assay_type']]
                assay_experiment.pchembl_value = record[header['pchembl_value']]
                assay_experiment.assay_description = record[header['assay_description']]
                assay_experiment.published_value = record[header['published_value']]
                assay_experiment.published_relation = record[header['published_relation']]
                assay_experiment.published_type = record[header['published_type']]
                assay_experiment.published_units = record[header['published_units']]
                assay_experiment.standard_value = record[header['standard_value']]
                assay_experiment.standard_relation = record[header['standard_relation']]
                assay_experiment.standard_type = record[header['standard_type']]
                assay_experiment.standard_units = record[header['standard_units']]
                try:
                    assay_experiment.save()
                except IntegrityError:
                    # presumably another worker inserted the same
                    # (protein, ligand, assay) row first -- confirm.
                    assay_experiment = AssayExperiment.objects.get(
                        protein=p, ligand=l, assay=assay)
        print('done, skipped:',skipped)
def main_func(self, positions, iteration,count,lock):
    """Walk ``self.data_all`` and resolve publication, ligand, protein and residue per row.

    For every row the referenced publication and the ligand are looked up or
    created (both memoised on ``self``), then the protein and the residue are
    looked up. Rows whose residue cannot be found are counted as skipped.
    Nothing else is stored for a row in this function.

    ``positions`` and ``iteration`` are not used. ``count.value`` is the
    shared cursor into the rows; it is read and advanced while holding
    ``lock``.
    """
    # presumably `count`/`lock` are a multiprocessing Value/Lock pair shared
    # by several workers -- TODO confirm against the caller.
    missing_proteins = {}
    c = 0
    skipped = 0
    rows = self.data_all
    # NOTE(review): the bounds check runs before the lock is taken; with
    # several workers the index read inside the lock could be past the end
    # -- confirm.
    while count.value<len(rows):
        with lock:
            # Claim the next row and advance the shared cursor.
            r = rows[count.value]
            count.value +=1
        current = time.time()
        c += 1

        # publication
        try: #fix if it thinks it's float.
            float(r['reference'])
            r['reference'] = str(int(r['reference']))
            float(r['review'])
            r['review'] = str(int(r['review']))
        except ValueError:
            pass

        if r['reference'].isdigit(): #assume pubmed
            pub_type = 'pubmed'
        else: #assume doi
            pub_type = 'doi'

        if r['reference'] not in self.publication_cache:
            # Get or create the web link; on an IntegrityError from create,
            # the link is simply fetched again.
            try:
                wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
            except WebLink.DoesNotExist:
                try:
                    wl = WebLink.objects.create(index=r['reference'], web_resource = WebResource.objects.get(slug=pub_type))
                except IntegrityError:
                    wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)

            try:
                pub = Publication.objects.get(web_link=wl)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = wl
                    pub.save()
                except IntegrityError:
                    pub = Publication.objects.get(web_link=wl)

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except:
                    self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                    continue #if something off with publication, skip.
            self.publication_cache[r['reference']] = pub
        else:
            pub = self.publication_cache[r['reference']]

        # Ligand cache is two-level: ligand name -> ligand id -> ligand.
        l = None
        if str(r['ligand_name']) in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']]
        else:
            self.ligand_cache[str(r['ligand_name'])] = {}

        if not l:
            try:
                l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name']))
            except Exception as msg:
                print('Something errored with ligand, aborting entry of mutation',r['ligand_name'],r['ligand_type'],r['ligand_id'],r['source_file'])
                print(msg)
                traceback.print_exc()
                continue
            self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

        protein=Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein=protein.get()
        elif r['protein'] not in missing_proteins:
            # Can contain code to try to figure out what protein it is.
            # NOTE(review): nothing in this function ever adds a key to
            # `missing_proteins`, so every unknown protein lands here and
            # `protein` is still the empty queryset when the residue filter
            # below runs -- confirm this is intended.
            pass
        else:
            missing_proteins[r['protein']] += 1
            continue

        res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
        if res.exists():
            res=res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
            # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file'])
            skipped += 1
            continue

    self.logger.info('Parsed '+str(c)+' bias data entries. Skipped '+str(skipped))
    # The sorted list is computed but not used afterwards; the raw dict is printed.
    sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1),reverse=True)
    print(missing_proteins)
def main_func(self, positions, iteration):
    """Build mutation experiments for a slice of ``self.data`` and bulk-insert them.

    ``positions`` is a (start, end) pair; a falsy ``end`` means "to the end of
    ``self.data``". For every row the publication, optional review
    publication, ligand, reference ligand, protein, residue and the various
    lookup records are fetched or created, a fold change is computed, and one
    ``MutationRaw`` plus one ``MutationExperiment`` object are queued. After
    the loop both queues are written with ``bulk_create``.

    Rows are skipped (and counted in ``skipped``) when the protein or the
    residue cannot be found; rows whose publication cannot be saved are
    skipped without being counted. ``iteration`` is not used.
    """
    # filenames
    if not positions[1]:
        rows = self.data[positions[0]:]
    else:
        rows = self.data[positions[0]:positions[1]]

    missing_proteins = {}
    mutants_for_proteins = {}
    c = 0
    skipped = 0
    inserted = 0
    bulk_m = []  # MutationExperiment objects, parallel to bulk_r
    bulk_r = []  # raw-row objects returned by self.insert_raw
    current_sheet = time.time()

    for r in rows:
        # print(source_file,c) # PRINT IF ERRORS OCCUR
        # self.logger.info('File '+str(r['source_file'])+' number '+str(c))
        current = time.time()
        c += 1
        # if c%100==0:
        #     self.logger.info('Parsed '+str(c)+' mutant data entries')

        # publication
        try: #fix if it thinks it's float.
            float(r['reference'])
            r['reference'] = str(int(r['reference']))
            float(r['review'])
            r['review'] = str(int(r['review']))
        except ValueError:
            pass

        if r['reference'].isdigit(): #assume pubmed
            pub_type = 'pubmed'
        else: #assume doi
            pub_type = 'doi'

        if r['reference'] not in self.publication_cache:
            try:
                pub = Publication.objects.get(
                    web_link__index=r['reference'],
                    web_link__web_resource__slug=pub_type)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = WebLink.objects.get(
                        index=r['reference'],
                        web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    wl = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(
                            slug=pub_type))
                    pub.web_link = wl

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except:
                    self.logger.error('error with reference ' +
                                      str(r['reference']) + ' ' + pub_type)
                    continue #if something off with publication, skip.
            self.publication_cache[r['reference']] = pub
        else:
            pub = self.publication_cache[r['reference']]

        # print(r['review'],r['reference'])
        if r['review'].isdigit(): #assume pubmed
            pub_type = 'pubmed'
        else: #assume doi
            pub_type = 'doi'

        # print(r['review'],pub_type)
        if r['review']:
            if r['review'] not in self.publication_cache:
                try:
                    pub_review = Publication.objects.get(
                        web_link__index=r['review'],
                        web_link__web_resource__slug=pub_type)
                except Publication.DoesNotExist:
                    pub_review = Publication()
                    try:
                        pub_review.web_link = WebLink.objects.get(
                            index=r['review'],
                            web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        wl = WebLink.objects.create(
                            index=r['review'],
                            web_resource=WebResource.objects.get(
                                slug=pub_type))
                        pub_review.web_link = wl

                    if pub_type == 'doi':
                        pub_review.update_from_doi(doi=r['review'])
                    elif pub_type == 'pubmed':
                        pub_review.update_from_pubmed_data(
                            index=r['review'])
                    try:
                        pub_review.save()
                    except:
                        self.logger.error('error with review ' +
                                          str(r['review']) + ' ' + pub_type)
                        continue #if something off with publication, skip.
                self.publication_cache[r['review']] = pub_review
            else:
                pub_review = self.publication_cache[r['review']]
        else:
            pub_review = None

        # Ligand cache is two-level: ligand name -> ligand id -> ligand.
        l = None
        if str(r['ligand_name']) in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                l = self.ligand_cache[str(
                    r['ligand_name'])][r['ligand_id']]
        else:
            self.ligand_cache[str(r['ligand_name'])] = {}

        if not l:
            l = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                   str(r['ligand_name']))
            self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

        # Reference ligand: resolved by name only and memoised per name.
        l_ref = None
        if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
            l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
        else:
            if Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'], canonical=True
            ).exists():
                #if this name is canonical and it has a ligand record already
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=True)
            elif Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'],
                    canonical=False,
                    ambigious_alias=False
            ).exists():
                #if this matches an alias that only has "one" parent canonical name - eg distinct
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False,
                                           ambigious_alias=False)
            elif Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'],
                    canonical=False,
                    ambigious_alias=True
            ).exists():
                #if this matches an alias that has several canonical parents, must investigate, start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                       canonical=False).exists():
                #ambigious_alias not specified
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False)
                l_ref.ambigious_alias = False
                l_ref.save()
            elif r['exp_mu_ligand_ref']:
                #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                try:
                    l_ref.save()
                except IntegrityError:
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False)
                    print("error failing ligand, duplicate?")
                    # logger.error("FAILED SAVING LIGAND, duplicate?")
            else:
                l_ref = None
            self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

        # These two are assigned but not read again in this function.
        protein_id = 0
        residue_id = 0

        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
            if r['protein'] in mutants_for_proteins:
                mutants_for_proteins[r['protein']] += 1
            else:
                mutants_for_proteins[r['protein']] = 1
        else:
            skipped += 1
            if r['protein'] in missing_proteins:
                missing_proteins[r['protein']] += 1
            else:
                # Log each missing protein only the first time it is seen.
                missing_proteins[r['protein']] = 1
                self.logger.error('Skipped due to no protein ' +
                                  r['protein'])
            continue

        res = Residue.objects.filter(
            protein_conformation__protein=protein,
            amino_acid=r['mutation_from'],
            sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' +
                              r['protein'] + ' pos:' +
                              str(r['mutation_pos']) + ' AA:' +
                              r['mutation_from'])
            skipped += 1
            continue

        if r['ligand_class']:
            l_role, created = LigandRole.objects.get_or_create(
                name=r['ligand_class'],
                defaults={'slug': slugify(r['ligand_class'])[:50]
                          }) # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, created = MutationFunc.objects.get_or_create(
                func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, created = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'],
                prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r[
                'opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r[
                    'opt_agonist']:
            exp_opt_id, created = MutationOptional.objects.get_or_create(
                type=r['opt_type'],
                wt=r['opt_wt'],
                mu=r['opt_mu'],
                sign=r['opt_sign'],
                percentage=r['opt_percentage'],
                qual=r['opt_qual'],
                agonist=r['opt_agonist'])
        else:
            exp_opt_id = None

        try:
            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)
        except IntegrityError:
            mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                            protein=protein,
                                            residue=res)

        # Experiment types that are stored as -log10 values.
        logtypes = ['pEC50', 'pIC50', 'pK']

        foldchange = 0
        typefold = ''  # set below but not read again in this function
        if r['exp_wt_value'] != 0 and r[
                'exp_mu_value_raw'] != 0: #fix for new format
            if re.match("(" + ")|(".join(logtypes) + ")",
                        r['exp_type']): #-log values!
                foldchange = round(
                    math.pow(10, -r['exp_mu_value_raw']) /
                    pow(10, -r['exp_wt_value']), 3)
                typefold = r['exp_type'] + "_log"
            else:
                foldchange = round(
                    r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                typefold = r['exp_type'] + "_not_log"

            # Ratios below 1 are reported as the negated inverse ratio.
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            # NOTE(review): a fold_effect that rounds to 0 would divide by
            # zero here -- confirm inputs cannot be that small.
            if foldchange < 1:
                foldchange = -round((1 / foldchange), 3)

        raw_experiment = self.insert_raw(r)
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            protein=protein,
            residue=res,
            ligand=l,
            ligand_role=l_role,
            ligand_ref=l_ref,
            #raw = raw_experiment, #raw_experiment, OR None
            optional=exp_opt_id,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'], #
            wt_unit=r['exp_wt_unit'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange)
        # mut_id = obj.id
        # The two lists stay index-aligned so `raw` can be attached after
        # the raw rows have been created.
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)
        inserted += 1
        end = time.time()
        diff = round(end - current, 2)
        #print(diff)

    self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' +
                     str(skipped))

    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    # NOTE(review): assumes bulk_create returns the objects in input order
    # so raws[i] matches bulk_m[i] -- confirm for the database backend in use.
    for i, me in enumerate(bulk_m):
        me.raw = raws[i]
    MutationExperiment.objects.bulk_create(bulk_m)
    end = time.time()
    diff = round(end - current, 2)
    current_sheet
    diff_2 = round(end - current_sheet, 2)
    print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
          str(skipped))
def main_func(self, positions, iteration, count, lock):
    """Build mutation experiments from ``self.data_all`` and bulk-insert them.

    Rows are claimed one at a time through the shared cursor ``count.value``,
    which is read and advanced while holding ``lock``. For every row the
    publication, optional review publication, ligand, reference ligand,
    protein, residue and the lookup records are fetched or created, a fold
    change is computed and written back to ``r['fold_effect']``, and one raw
    row plus one ``MutationExperiment`` are queued. After the loop both
    queues are written with ``bulk_create``.

    A protein that is not found by entry name is retried through a local
    lookup table, then as a UniProt accession, then through the UniProt web
    API. Rows are skipped when the publication cannot be saved, the ligand
    lookup raises, or the protein or residue cannot be resolved.

    ``positions`` and ``iteration`` are not used.
    """
    # presumably `count`/`lock` are a multiprocessing Value/Lock pair shared
    # by several workers -- TODO confirm against the caller.
    # filenames
    # if not positions[1]:
    #     rows = self.data[positions[0]:]
    # else:
    #     rows = self.data[positions[0]:positions[1]]

    missing_proteins = {}
    mutants_for_proteins = {}
    wrong_uniport_ids = {}  # row protein id -> entry_name that was resolved for it
    c = 0
    skipped = 0
    inserted = 0
    bulk_m = []  # MutationExperiment objects, parallel to bulk_r
    bulk_r = []  # raw-row objects returned by self.insert_raw
    current_sheet = time.time()
    rows = self.data_all
    # NOTE(review): the bounds check runs before the lock is taken; with
    # several workers the index read inside the lock could be past the end
    # -- confirm.
    while count.value < len(rows):
        with lock:
            # Claim the next row and advance the shared cursor.
            r = rows[count.value]
            count.value += 1
        # for r in rows:
        # print(r['source_file'],c) # PRINT IF ERRORS OCCUR
        #self.logger.info('File '+str(r['source_file'])+' number '+str(c))
        current = time.time()
        c += 1
        # if c%100==0:
        #     self.logger.info('Parsed '+str(c)+' mutant data entries')

        # publication
        try: #fix if it thinks it's float.
            float(r['reference'])
            r['reference'] = str(int(r['reference']))
            float(r['review'])
            r['review'] = str(int(r['review']))
        except ValueError:
            pass

        if r['reference'].isdigit(): #assume pubmed
            pub_type = 'pubmed'
        else: #assume doi
            pub_type = 'doi'

        if r['reference'] not in self.publication_cache:
            # Get or create the web link; on an IntegrityError from create,
            # the link is simply fetched again.
            try:
                wl = WebLink.objects.get(index=r['reference'],
                                         web_resource__slug=pub_type)
            except WebLink.DoesNotExist:
                try:
                    wl = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(
                            slug=pub_type))
                except IntegrityError:
                    wl = WebLink.objects.get(index=r['reference'],
                                             web_resource__slug=pub_type)

            try:
                pub = Publication.objects.get(web_link=wl)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = wl
                    pub.save()
                except IntegrityError:
                    pub = Publication.objects.get(web_link=wl)

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except:
                    self.logger.error('error with reference ' +
                                      str(r['reference']) + ' ' + pub_type)
                    continue #if something off with publication, skip.
            self.publication_cache[r['reference']] = pub
        else:
            pub = self.publication_cache[r['reference']]

        # print(r['review'],r['reference'])
        if r['review'].isdigit(): #assume pubmed
            pub_type = 'pubmed'
        elif r['review'].startswith('http'):
            pub_type = 'raw_link'
        else: #assume doi
            pub_type = 'doi'

        # print(r['review'],pub_type)
        if r['review']:
            if r['review'] not in self.publication_cache:
                try:
                    wl = WebLink.objects.get(index=r['review'],
                                             web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    try:
                        wl = WebLink.objects.create(
                            index=r['review'],
                            web_resource=WebResource.objects.get(
                                slug=pub_type))
                    except IntegrityError:
                        wl = WebLink.objects.get(
                            index=r['review'],
                            web_resource__slug=pub_type)

                try:
                    pub_review = Publication.objects.get(web_link=wl)
                except Publication.DoesNotExist:
                    pub_review = Publication()
                    try:
                        pub_review.web_link = wl
                        pub_review.save()
                    except IntegrityError:
                        pub_review = Publication.objects.get(web_link=wl)

                    # A 'raw_link' review gets no metadata update.
                    if pub_type == 'doi':
                        pub_review.update_from_doi(doi=r['review'])
                    elif pub_type == 'pubmed':
                        pub_review.update_from_pubmed_data(
                            index=r['review'])
                    try:
                        pub_review.save()
                    except:
                        self.logger.error('error with review ' +
                                          str(r['review']) + ' ' + pub_type)
                        continue #if something off with publication, skip.
                self.publication_cache[r['review']] = pub_review
            else:
                pub_review = self.publication_cache[r['review']]
        else:
            pub_review = None

        # Ligand cache is two-level: ligand name -> ligand id -> ligand.
        l = None
        if str(r['ligand_name']) in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                l = self.ligand_cache[str(
                    r['ligand_name'])][r['ligand_id']]
        else:
            self.ligand_cache[str(r['ligand_name'])] = {}

        if not l:
            try:
                l = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                       str(r['ligand_name']))
            except Exception as msg:
                print(
                    'Something errored with ligand, aborting entry of mutation',
                    r['ligand_name'], r['ligand_type'], r['ligand_id'],
                    r['source_file'])
                print(msg)
                traceback.print_exc()
                continue
            self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

        # Reference ligand: resolved by name only and memoised per name.
        l_ref = None
        if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
            l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
        else:
            if Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'], canonical=True
            ).exists():
                #if this name is canonical and it has a ligand record already
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=True)
            elif Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'],
                    canonical=False,
                    ambigious_alias=False
            ).exists():
                #if this matches an alias that only has "one" parent canonical name - eg distinct
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False,
                                           ambigious_alias=False)
            elif Ligand.objects.filter(
                    name=r['exp_mu_ligand_ref'],
                    canonical=False,
                    ambigious_alias=True
            ).exists():
                #if this matches an alias that has several canonical parents, must investigate, start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                       canonical=False).exists():
                #ambigious_alias not specified
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False)
                l_ref.ambigious_alias = False
                l_ref.save()
            elif r['exp_mu_ligand_ref']:
                #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                try:
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                except IntegrityError:
                    # presumably another worker created the same ligand
                    # first; reuse that record -- confirm.
                    if Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                             canonical=True).exists():
                        l_ref = Ligand.objects.get(
                            name=r['exp_mu_ligand_ref'], canonical=True)
                    else:
                        l_ref = Ligand.objects.get(
                            name=r['exp_mu_ligand_ref'], canonical=False)
                    # print("error failing ligand, duplicate?")
                try:
                    l_ref.save()
                except IntegrityError:
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False)
                    # print("error failing ligand, duplicate?")
                    # logger.error("FAILED SAVING LIGAND, duplicate?")
            else:
                l_ref = None
            self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

        # These two are assigned but not read again in this function.
        protein_id = 0
        residue_id = 0

        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
            if r['protein'] in mutants_for_proteins:
                mutants_for_proteins[r['protein']] += 1
            else:
                mutants_for_proteins[r['protein']] = 1
        elif r['protein'] not in missing_proteins:
            try:
                # First try the local table of ids that were already resolved.
                # NOTE(review): r['protein'] is overwritten on the first line,
                # so the second lookup uses the resolved name as key and will
                # normally raise KeyError into the fallback below -- confirm.
                r['protein'] = wrong_uniport_ids[r['protein']]
                real_uniprot = wrong_uniport_ids[r['protein']]
                protein = Protein.objects.get(entry_name=r['protein'])
                # print('fetched with lookup table',r['protein'])
            except:
                # look for it as uniprot
                protein = Protein.objects.filter(
                    web_links__web_resource__slug='uniprot',
                    web_links__index=r['protein'].upper())
                if protein.exists():
                    protein = protein.get()
                    real_uniprot = protein.entry_name
                    if r['protein'] in mutants_for_proteins:
                        mutants_for_proteins[r['protein']] += 1
                    else:
                        mutants_for_proteins[r['protein']] = 1
                else:
                    # Try to lookup in uniprot to catch typing errors / variants in entry_name
                    url = 'http://www.uniprot.org/uniprot/$index.xml'
                    cache_dir = ['uniprot', 'id']
                    uniprot_protein = fetch_from_web_api(url,
                                                         r['protein'],
                                                         cache_dir,
                                                         xml=True)
                    try:
                        real_uniprot = uniprot_protein.find(
                            './/{http://uniprot.org/uniprot}name'
                        ).text.lower()
                        protein = Protein.objects.get(
                            entry_name=real_uniprot)
                    except:
                        skipped += 1
                        if r['protein'] in missing_proteins:
                            missing_proteins[r['protein']] += 1
                        else:
                            # Log each missing protein only the first time.
                            missing_proteins[r['protein']] = 1
                            # print('Skipped due to no protein '+ r['protein'])
                            self.logger.error(
                                'Skipped due to no protein ' + r['protein'])
                        continue
                # Remember the mapping, then continue with the resolved name.
                wrong_uniport_ids[r['protein']] = protein.entry_name
                r['protein'] = real_uniprot
        else:
            missing_proteins[r['protein']] += 1
            continue

        res = Residue.objects.filter(
            protein_conformation__protein=protein,
            amino_acid=r['mutation_from'],
            sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' +
                              r['protein'] + ' pos:' +
                              str(r['mutation_pos']) + ' AA:' +
                              r['mutation_from'])
            # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file'])
            skipped += 1
            continue

        if r['ligand_class']:
            try:
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]
                              }) # FIXME this should not be needed
            except Exception as e:
                if LigandRole.objects.filter(
                        slug=slugify(r['ligand_class'])[:50]).exists():
                    l_role = LigandRole.objects.get(
                        slug=slugify(r['ligand_class'])[:50])
                    if l_role.name == slugify(r['ligand_class'])[:50]:
                        #if name of role is same as slug, then it was created by constructs script, replace it
                        l_role.name = r['ligand_class']
                        l_role.save()
                else:
                    print(e)
                    print("Error with", r['ligand_class'],
                          slugify(r['ligand_class'])[:50])
                    l_role, created = LigandRole.objects.get_or_create(
                        slug=slugify(r['ligand_class'])
                        [:50]) # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, created = MutationFunc.objects.get_or_create(
                func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, created = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'],
                prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        # if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
        #     exp_opt_id, created = MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
        # else:
        #     exp_opt_id = None

        try:
            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)
        except IntegrityError:
            mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                            protein=protein,
                                            residue=res)

        # Experiment types that are stored as -log10 values.
        logtypes = ['pEC50', 'pIC50', 'pK']

        foldchange = 0
        typefold = ''  # set below but not read again in this function
        if r['exp_wt_value'] != 0 and r[
                'exp_mu_value_raw'] != 0: #fix for new format
            if re.match("(" + ")|(".join(logtypes) + ")",
                        r['exp_type']): #-log values!
                try:
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw']) /
                        pow(10, -r['exp_wt_value']), 3)
                except:
                    # Dump the row and keep foldchange at 0.
                    print(r)
                typefold = r['exp_type'] + "_log"
            elif "%" == r['exp_wt_unit']:
                # if % then it's a difference case, then lower value is bad. Otherwise it's conc and lower is better
                # NOTE(review): `typefold` is not set in this branch -- confirm
                # that is intended.
                foldchange = round(
                    r['exp_wt_value'] / r['exp_mu_value_raw'], 3)
            else:
                foldchange = round(
                    r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                typefold = r['exp_type'] + "_not_log"
            # Ratios between 0 and 1 are reported as the negated inverse ratio.
            if foldchange > 0 and foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            # NOTE(review): a fold_effect that rounds to 0 would divide by
            # zero here -- confirm inputs cannot be that small.
            if foldchange < 1:
                foldchange = -round((1 / foldchange), 3)
        # The raw row stores the computed fold change, not the input value.
        r['fold_effect'] = foldchange
        raw_experiment = self.insert_raw(r)
        # raw_experiment.save()
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            submitting_group=r['submitting_group'],
            data_container=r['data_container'],
            data_container_number=r['data_container_number'],
            protein=protein,
            residue=res,
            ligand=l,
            ligand_role=l_role,
            ligand_ref=l_ref,
            # raw = raw_experiment, #raw_experiment, OR None
            # optional = exp_opt_id,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'], #
            wt_unit=r['exp_wt_unit'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange,
            opt_receptor_expression=r['opt_receptor_expression'],
            opt_basal_activity=r['opt_basal_activity'],
            opt_gain_of_activity=r['opt_gain_of_activity'],
            opt_ligand_emax=r['opt_ligand_emax'],
            opt_agonist=r['opt_agonist'], )
        # for line,val in r.items():
        #     val = str(val)
        #     if len(val)>100:
        #         print(line,"too long",val)
        # mut_id = obj.id
        # The two lists stay index-aligned so `raw` can be attached after
        # the raw rows have been created.
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)
        # try:
        #     bulk.save()
        # except Exception as e:
        #     print(e)
        #     print(r)
        #     break
        #print('saved ',r['source_file'])
        inserted += 1
        end = time.time()
        diff = round(end - current, 2)
        #print(diff)

    self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' +
                     str(skipped))

    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    # NOTE(review): assumes bulk_create returns the objects in input order
    # so raws[i] matches bulk_m[i] -- confirm for the database backend in use.
    for i, me in enumerate(bulk_m):
        me.raw = raws[i]
    MutationExperiment.objects.bulk_create(bulk_m)
    end = time.time()
    diff = round(end - current, 2)
    # current_sheet
    diff_2 = round(end - current_sheet, 2)
    print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
          str(skipped))
    # Computed but not used afterwards in this function.
    sorted_missing_proteins = sorted(missing_proteins.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)
def _new_chemical_list(list_name):
    """Create and save an empty ChemicalList labelled with ``list_name``."""
    chemical_list = ChemicalList()
    chemical_list.name, _ = ChemicalListName.objects.get_or_create(name=list_name)
    chemical_list.save()
    return chemical_list


def _add_chemical(chemical_list, type_name, chemical_name, concentration, unit):
    """Get or create the type/chemical/concentration rows and add them to ``chemical_list``."""
    chemical_type, _ = ChemicalType.objects.get_or_create(name=type_name)
    chemical, _ = Chemical.objects.get_or_create(name=chemical_name, chemical_type=chemical_type)
    conc, _ = ChemicalConc.objects.get_or_create(
        concentration=concentration, concentration_unit=unit, chemical=chemical)
    chemical_list.chemicals.add(conc)


def add_construct(d):
    """Store one construct description (a nested dict) as database records.

    Any existing Construct with the same name is deleted first. The function
    then creates the Construct together with its crystal info, contributor,
    mutations, deletions, insertions, modifications, expression system,
    solubilization, purification and crystallization records.

    Keys read from ``d``:
      * ``construct_crystal`` (required): ``pdb_name``, ``uniprot``, ``pdb``,
        ``ligand_id``, ``ligand_id_type``, ``ligand_name`` and optionally
        ``ligand_activity``, ``ligand_conc``, ``ligand_conc_unit``.
      * ``mutations``, ``deletions``, ``auxiliary``, ``modifications``
        (required).
      * ``contact_info``, ``expression``, ``solubilization``,
        ``crystallization`` (optional sections).

    Side effects on ``d``: missing ``type``/``remark`` on each mutation,
    ``expression['expr_remark']`` and ``construct_crystal['ligand_activity']``
    are filled in with defaults, exactly as before.

    NOTE(review): the nesting of the purification block and of the trailing
    ``construct.save()`` calls was reconstructed from data dependencies;
    confirm against the intended control flow.
    """
    xtal = d['construct_crystal']

    #delete if already name there
    Construct.objects.filter(name=xtal['pdb_name']).delete()

    protein = Protein.objects.filter(entry_name=xtal['uniprot']).get()
    structure = Structure.objects.filter(pdb_code__index=xtal['pdb'].upper()).get()

    construct = Construct()
    construct.protein = protein
    construct.name = xtal['pdb_name']
    # The JSON snapshot is taken before any defaults are filled in below.
    construct.json = json.dumps(d, indent=4, separators=(',', ': '))
    construct.structure = structure

    #CrystalInfo
    crystal = CrystalInfo()
    crystal.resolution = structure.resolution
    crystal.pdb_data = structure.pdb_data
    crystal.pdb_code = structure.pdb_code.index
    crystal.save()
    construct.crystal = crystal

    #Contact INFO
    if 'contact_info' in d:
        contact = d['contact_info']
        construct.contributor, _ = ContributorInfo.objects.get_or_create(
            name=contact['name_cont'],
            pi_email=contact['pi_email'],
            pi_name=contact['pi_name'],
            urls=contact['url'],
            # Input dates are month/day/year; stored as ISO year-month-day.
            date=datetime.datetime.strptime(contact['date'], '%m/%d/%Y').strftime('%Y-%m-%d'),
            address=contact['address'])

    # The construct must exist in the database before many-to-many rows are added.
    construct.save()

    #MUTATIONS
    for mutation in d['mutations']:
        mutation.setdefault('type', '')
        mutation.setdefault('remark', '')
        mut = ConstructMutation.objects.create(
            sequence_number=mutation['pos'],
            wild_type_amino_acid=mutation['wt'],
            mutated_amino_acid=mutation['mut'],
            mutation_type=mutation['type'],
            remark=mutation['remark'])
        construct.mutations.add(mut)

    #DELETIONS
    # Deletions that originate from an insertion are remembered by the id
    # embedded in their 'origin' value so the insertion can reuse the range.
    insert_deletions = {}
    for deletion in d['deletions']:
        if 'start' in deletion:
            dele = ConstructDeletion.objects.create(start=deletion['start'], end=deletion['end'])
        else:
            dele = ConstructDeletion.objects.create(start=deletion['pos'], end=deletion['pos'])
        construct.deletions.add(dele)
        if deletion['origin'] != 'user':
            origin_id = deletion['origin'].split('_')[1]
            insert_deletions[origin_id] = deletion

    #INSERTIONS (AUX)
    for name, aux in d['auxiliary'].items():
        aux_id = name.replace('aux', '')
        aux_type, _ = ConstructInsertionType.objects.get_or_create(name=aux['type'], subtype=aux['subtype'])
        insert = ConstructInsertion.objects.create(
            insert_type=aux_type,
            presence=aux['presence'],
            position=aux['position'] + "_" + aux_id)

        if insert.presence == 'YES' and insert.position.startswith('Within Receptor'):
            #need to fetch range
            if 'start' in aux:
                insert.start = aux['start']
                insert.end = aux['start']
            else:
                insert.start = insert_deletions[aux_id]['start']
                insert.end = insert_deletions[aux_id]['end']
            insert.save()

        construct.insertions.add(insert)

    #MODIFICATIONS
    for modification in d['modifications']:
        mod = ConstructModification.objects.create(
            modification=modification['type'],
            position_type=modification['position'][0],
            pos_start=modification['position'][1][0],
            pos_end=modification['position'][1][1],
            remark=modification['remark'])
        construct.modifications.add(mod)

    #EXPRESSION
    if 'expression' in d and 'expr_method' in d['expression']:
        expression = d['expression']
        expression.setdefault('expr_remark', '')
        construct.expression, _ = ExpressionSystem.objects.get_or_create(
            expression_method=expression['expr_method'],
            host_cell_type=expression['host_cell_type'],
            host_cell=expression['host_cell'],
            remarks=expression['expr_remark'])

    #solubilization
    if 'solubilization' in d and 'deterg_type' in d['solubilization']:
        solub = d['solubilization']
        c_list = _new_chemical_list('Solubilization')
        _add_chemical(c_list, 'detergent', solub['deterg_type'],
                      solub['deterg_concentr'], solub['deterg_concentr_unit'])
        _add_chemical(c_list, 'additive', solub['solub_additive'],
                      solub['additive_concentr'], solub['addit_concentr_unit'])

        construct.solubilization = Solubilization.objects.create(chemical_list=c_list)
        construct.save()

        #Purification
        purification = Purification.objects.create()
        for puri, step in solub.items():
            # Only these two key families name purification steps.
            if puri.startswith(('chem_enz_treatment', 'sol_remark')):
                purification_step, _ = PurificationStep.objects.get_or_create(name=step)
                purification.steps.add(purification_step)
        construct.purification = purification

    construct.save()

    #CRYSTALLIZATION
    if 'crystallization' in d and 'crystal_type' in d['crystallization']:
        cryst = d['crystallization']

        c = Crystallization()
        c_type, _ = CrystallizationTypes.objects.get_or_create(
            name=cryst['crystal_type'], sub_name=cryst.get('lcp_lipid', ""))
        c_method, _ = CrystallizationMethods.objects.get_or_create(name=cryst['crystal_method'])
        c.crystal_type = c_type
        c.crystal_method = c_method
        if 'crystal_remark' in cryst:
            c.remarks = cryst['crystal_remark']
        c.temp = cryst['temperature']

        if cryst['ph'] == 'single_ph':
            c.ph_start = cryst['ph_single']
            c.ph_end = cryst['ph_single']
        else:
            c.ph_start = cryst['ph_range_one']
            c.ph_end = cryst['ph_range_two']

        c.protein_conc = cryst['protein_concentr']
        c.protein_conc_unit = cryst['protein_conc_unit']
        c.save()

        #MAKE LISTS
        c_list = _new_chemical_list('crystallization_chemical_components')
        for chemical in cryst['chemical_components']:
            _add_chemical(c_list, 'crystallization_chemical_components',
                          chemical['component'], chemical['value'], chemical['unit'])
        c.chemical_lists.add(c_list)

        if cryst['crystal_type'] == 'lipidic cubic phase': #make list of LCP stuff
            c_list = _new_chemical_list('LCP')
            _add_chemical(c_list, 'LCP Lipid additive', cryst['lcp_add'],
                          cryst['lcp_conc'], cryst['lcp_conc_unit'])
            c.chemical_lists.add(c_list)

        #DETERGENT
        if 'detergent' in cryst:
            c_list = _new_chemical_list('Detergent')
            _add_chemical(c_list, 'detergent', cryst['detergent'],
                          cryst['deterg_conc'], cryst['deterg_conc_unit'])
            c.chemical_lists.add(c_list)

        #LIPID
        if 'lipid' in cryst:
            c_list = _new_chemical_list('Lipid')
            _add_chemical(c_list, 'lipid', cryst['lipid'],
                          cryst['lipid_concentr'], cryst['lipid_concentr_unit'])
            c.chemical_lists.add(c_list)

        #Use ligand function to get ligand if it exists or otherwise create. Lots of checks for inchi/smiles/name
        ligand = get_or_make_ligand(xtal['ligand_id'], xtal['ligand_id_type'], xtal['ligand_name'])
        xtal.setdefault('ligand_activity', 'unknown')

        if ligand:
            role_slug = slugify(xtal['ligand_activity'])
            try:
                lr, _ = LigandRole.objects.get_or_create(
                    slug=role_slug, defaults={'name': xtal['ligand_activity']})
            except IntegrityError:
                # The role already exists under this slug; reuse it.
                lr = LigandRole.objects.get(slug=role_slug)

            ligand_c = CrystallizationLigandConc()
            ligand_c.construct_crystallization = c
            ligand_c.ligand = ligand
            if lr:
                ligand_c.ligand_role = lr
            if 'ligand_conc' in xtal:
                ligand_c.ligand_conc = xtal['ligand_conc']
            if 'ligand_conc_unit' in xtal:
                ligand_c.ligand_conc_unit = xtal['ligand_conc_unit']
            ligand_c.save()
            c.ligands.add(ligand_c)

        construct.crystallization = c

    construct.save()
def create_mutant_data(self, filenames):
    """Parse mutant data files and store one MutationExperiment per usable row.

    Args:
        filenames: names of files inside ``self.structure_data_dir``. When
            empty/None, every entry of that directory is considered.

    Files ending in ``xlsx``/``xls`` are read with ``self.loaddatafromexcel``
    and ``self.analyse_rows``; files ending in ``yaml`` are mapped onto the
    same row layout with empty experiment fields. Other files are logged as
    unknown and skipped. Names starting with ``.`` and non-files are ignored.

    A row is skipped (and logged) when its publication cannot be saved, its
    protein is unknown, or no residue matches the wild-type amino acid at the
    given position.

    NOTE(review): block nesting was reconstructed from collapsed whitespace;
    in particular the "<1 -> negative reciprocal" fold-change adjustment is
    assumed to apply to both log and non-log values. Confirm.
    """
    self.logger.info('CREATING MUTANT DATA')

    # what files should be parsed?
    if not filenames:
        filenames = os.listdir(self.structure_data_dir)

    # Experiment types reported as -log10 values.
    log_types = ('pEC50', 'pIC50', 'pK')

    missing_proteins = {}
    mutants_for_proteins = {}

    for source_file in filenames:
        source_file_path = os.sep.join([self.structure_data_dir, source_file])
        if not (os.path.isfile(source_file_path) and source_file[0] != '.'):
            continue

        self.logger.info('Reading file {}'.format(source_file_path))

        if source_file.endswith(('xlsx', 'xls')):
            rows = self.loaddatafromexcel(source_file_path)
            rows = self.analyse_rows(rows)
        elif source_file.endswith('yaml'):
            # NOTE(review): yaml.load can construct arbitrary objects from the
            # file; consider yaml.safe_load if these files are not trusted.
            with open(source_file_path, 'r') as yaml_file:
                rows = yaml.load(yaml_file)
            temp = []
            for r in rows:
                d = {
                    'reference': r['pubmed'],
                    'protein': r['entry_name'].replace("__", "_").lower(),
                    'mutation_pos': r['seq'],
                    'mutation_from': r['from_res'],
                    'mutation_to': r['to_res'],
                    'ligand_name': '',
                    'ligand_type': '',
                    'ligand_id': '',
                    'ligand_class': '',
                    'exp_type': '',
                    'exp_func': '',
                    'exp_wt_value': 0,
                    'exp_wt_unit': '',
                    'exp_mu_effect_sign': '',
                    'exp_mu_value_raw': 0,
                    'fold_effect': 0,
                    'exp_mu_effect_qual': '',
                    'exp_mu_effect_ligand_prop': '',
                    'exp_mu_ligand_ref': '',
                    'opt_type': '',
                    'opt_wt': 0,
                    'opt_mu': 0,
                    'opt_sign': '',
                    'opt_percentage': 0,
                    'opt_qual': '',
                    'opt_agonist': '',
                }
                if len(d['mutation_to']) > 1 or len(d['mutation_from']) > 1:
                    #if something is off with amino acid
                    continue
                temp.append(d)
            rows = temp
        else:
            # Was "'unknown format'.source_file", which raised AttributeError.
            self.logger.info('unknown format ' + source_file)
            continue

        c = 0
        skipped = 0
        for r in rows:
            c += 1
            if c % 1000 == 0:
                self.logger.info('Parsed ' + str(c) + ' mutant data entries')

            # publication
            try: #fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
            except ValueError:
                pass

            if r['reference'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            else: #assume doi
                pub_type = 'doi'

            try:
                pub = Publication.objects.get(web_link__index=r['reference'],
                                              web_link__web_resource__slug=pub_type)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = WebLink.objects.get(index=r['reference'],
                                                       web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    wl = WebLink.objects.create(index=r['reference'],
                                                web_resource=WebResource.objects.get(slug=pub_type))
                    pub.web_link = wl

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except Exception:
                    self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                    continue #if something off with publication, skip.

            l = get_or_make_ligand(r['ligand_id'], r['ligand_type'], str(r['ligand_name']))

            ref_name = r['exp_mu_ligand_ref']
            if Ligand.objects.filter(name=ref_name, canonical=True).exists():
                #if this name is canonical and it has a ligand record already
                l_ref = Ligand.objects.get(name=ref_name, canonical=True)
            elif Ligand.objects.filter(name=ref_name, canonical=False, ambigious_alias=False).exists():
                #if this matches an alias that only has "one" parent canonical name - eg distinct
                l_ref = Ligand.objects.get(name=ref_name, canonical=False, ambigious_alias=False)
            elif Ligand.objects.filter(name=ref_name, canonical=False, ambigious_alias=True).exists():
                #if this matches an alias that only has several canonical parents, must investigate, start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(ref_name)
                l_ref.save()
            elif ref_name:
                #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                l_ref.save()
                l_ref.load_by_name(ref_name)
                l_ref.save()
            else:
                l_ref = None

            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
                mutants_for_proteins[r['protein']] = mutants_for_proteins.get(r['protein'], 0) + 1
            else:
                skipped += 1
                missing_proteins[r['protein']] = missing_proteins.get(r['protein'], 0) + 1
                self.logger.error('Skipped due to no protein ' + r['protein'])
                continue

            res = Residue.objects.filter(protein_conformation__protein=protein,
                                         amino_acid=r['mutation_from'],
                                         sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
            if res.exists():
                res = res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein']
                                  + ' pos:' + str(r['mutation_pos']) + ' AA:' + r['mutation_from'])
                skipped += 1
                continue

            if r['ligand_class']:
                l_role, _ = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, _ = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, _ = MutationFunc.objects.get_or_create(func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, _ = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                    or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']):
                exp_opt_id, _ = MutationOptional.objects.get_or_create(
                    type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'],
                    percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            mutation, _ = Mutation.objects.get_or_create(amino_acid=r['mutation_to'],
                                                         protein=protein, residue=res)

            foldchange = 0
            if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0: #fix for new format
                if r['exp_type'].startswith(log_types): #-log values!
                    foldchange = round(math.pow(10, -r['exp_mu_value_raw']) / pow(10, -r['exp_wt_value']), 3)
                else:
                    foldchange = round(r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                # Ratios below 1 are stored as a negative reciprocal.
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                # A tiny fold_effect rounds to 0; guard the division like above.
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)

            raw_experiment = self.insert_raw(r)
            MutationExperiment.objects.get_or_create(
                refs=pub,
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                raw=raw_experiment,
                optional=exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'], #
                wt_unit=r['exp_wt_unit'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange,
            )

        self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' + str(skipped))

    # Per-protein tallies, most frequent first (currently not reported anywhere).
    sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1), reverse=True)
    sorted_mutants_for_proteins = sorted(mutants_for_proteins.items(), key=operator.itemgetter(1), reverse=True)

    self.logger.info('COMPLETED CREATING MUTANTS')
def main_func(self, positions, iteration):
    """Import one slice of ``self.data`` as MutationExperiment rows in bulk.

    Args:
        positions: ``(start, stop)`` indices into ``self.data``; a falsy
            ``stop`` means "to the end".
        iteration: not used by this method.

    For every row the publication, optional review publication, ligand,
    reference ligand, protein, residue and experiment descriptors are looked
    up or created (using ``self.publication_cache``, ``self.ligand_cache``
    and ``self.ref_ligand_cache``). Rows are skipped when a publication
    cannot be saved, the protein is unknown, or no residue matches. The raw
    rows and experiments are then written with two ``bulk_create`` calls and
    a timing summary is printed.

    NOTE(review): block nesting was reconstructed from collapsed whitespace;
    the "<1 -> negative reciprocal" fold-change adjustment is assumed to
    apply to both log and non-log values. Confirm.
    """

    def load_publication(index, label):
        """Return the Publication for ``index``, or None if it cannot be saved."""
        if index in self.publication_cache:
            return self.publication_cache[index]

        pub_type = 'pubmed' if index.isdigit() else 'doi' #digits: assume pubmed, else doi
        try:
            publication = Publication.objects.get(web_link__index=index,
                                                  web_link__web_resource__slug=pub_type)
        except Publication.DoesNotExist:
            publication = Publication()
            try:
                publication.web_link = WebLink.objects.get(index=index, web_resource__slug=pub_type)
            except WebLink.DoesNotExist:
                publication.web_link = WebLink.objects.create(
                    index=index, web_resource=WebResource.objects.get(slug=pub_type))

            if pub_type == 'doi':
                publication.update_from_doi(doi=index)
            else:
                publication.update_from_pubmed_data(index=index)
            try:
                publication.save()
            except Exception:
                self.logger.error('error with ' + label + ' ' + str(index) + ' ' + pub_type)
                return None

        self.publication_cache[index] = publication
        return publication

    if not positions[1]:
        rows = self.data[positions[0]:]
    else:
        rows = self.data[positions[0]:positions[1]]

    # Experiment types reported as -log10 values.
    log_types = ('pEC50', 'pIC50', 'pK')

    c = 0
    skipped = 0
    bulk_m = []
    bulk_r = []
    current_sheet = time.time()

    for r in rows:
        c += 1

        # publication
        # fix if it thinks it's float. Each key gets its own try so a DOI
        # reference no longer prevents the review id from being normalised.
        for key in ('reference', 'review'):
            try:
                float(r[key])
                r[key] = str(int(r[key]))
            except ValueError:
                pass

        pub = load_publication(r['reference'], 'reference')
        if pub is None:
            continue #if something off with publication, skip.

        if r['review']:
            pub_review = load_publication(r['review'], 'review')
            if pub_review is None:
                continue #if something off with publication, skip.
        else:
            pub_review = None

        ligand_name = str(r['ligand_name'])
        l = None
        if ligand_name in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[ligand_name]:
                l = self.ligand_cache[ligand_name][r['ligand_id']]
        else:
            self.ligand_cache[ligand_name] = {}

        if not l:
            l = get_or_make_ligand(r['ligand_id'], r['ligand_type'], ligand_name)
            self.ligand_cache[ligand_name][r['ligand_id']] = l

        ref_name = r['exp_mu_ligand_ref']
        l_ref = None
        if str(ref_name) in self.ref_ligand_cache:
            l_ref = self.ref_ligand_cache[str(ref_name)]
        else:
            if Ligand.objects.filter(name=ref_name, canonical=True).exists():
                #if this name is canonical and it has a ligand record already
                l_ref = Ligand.objects.get(name=ref_name, canonical=True)
            elif Ligand.objects.filter(name=ref_name, canonical=False, ambigious_alias=False).exists():
                #if this matches an alias that only has "one" parent canonical name - eg distinct
                l_ref = Ligand.objects.get(name=ref_name, canonical=False, ambigious_alias=False)
            elif Ligand.objects.filter(name=ref_name, canonical=False, ambigious_alias=True).exists():
                #if this matches an alias that only has several canonical parents, must investigate, start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(ref_name)
                l_ref.save()
            elif Ligand.objects.filter(name=ref_name, canonical=False).exists():
                #amigious_alias not specified
                l_ref = Ligand.objects.get(name=ref_name, canonical=False)
                l_ref.ambigious_alias = False
                l_ref.save()
            elif ref_name:
                #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                l_ref.save()
                l_ref.load_by_name(ref_name)
                try:
                    l_ref.save()
                except IntegrityError:
                    l_ref = Ligand.objects.get(name=ref_name, canonical=False)
                    print("error failing ligand, duplicate?")
            else:
                l_ref = None
            self.ref_ligand_cache[str(ref_name)] = l_ref

        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
        else:
            skipped += 1
            self.logger.error('Skipped due to no protein ' + r['protein'])
            continue

        res = Residue.objects.filter(protein_conformation__protein=protein,
                                     amino_acid=r['mutation_from'],
                                     sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein']
                              + ' pos:' + str(r['mutation_pos']) + ' AA:' + r['mutation_from'])
            skipped += 1
            continue

        if r['ligand_class']:
            l_role, _ = LigandRole.objects.get_or_create(
                name=r['ligand_class'],
                defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, _ = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, _ = MutationFunc.objects.get_or_create(func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, _ = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']):
            exp_opt_id, _ = MutationOptional.objects.get_or_create(
                type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'],
                percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
        else:
            exp_opt_id = None

        try:
            mutation, _ = Mutation.objects.get_or_create(amino_acid=r['mutation_to'],
                                                         protein=protein, residue=res)
        except IntegrityError:
            # The row was created concurrently; fetch the existing one.
            mutation = Mutation.objects.get(amino_acid=r['mutation_to'], protein=protein, residue=res)

        foldchange = 0
        if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0: #fix for new format
            if r['exp_type'].startswith(log_types): #-log values!
                foldchange = round(math.pow(10, -r['exp_mu_value_raw']) / pow(10, -r['exp_wt_value']), 3)
            else:
                foldchange = round(r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
            # Ratios below 1 are stored as a negative reciprocal.
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            # A tiny fold_effect rounds to 0; guard the division like above.
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)

        raw_experiment = self.insert_raw(r)
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            protein=protein,
            residue=res,
            ligand=l,
            ligand_role=l_role,
            ligand_ref=l_ref,
            # raw is attached after the raw rows have been bulk-created below
            optional=exp_opt_id,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'], #
            wt_unit=r['exp_wt_unit'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange,
        )
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)

    self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' + str(skipped))

    current = time.time()

    # Create the raw rows first so each experiment can point at its raw row.
    raws = MutationRaw.objects.bulk_create(bulk_r)
    for experiment, raw in zip(bulk_m, raws):
        experiment.raw = raw
    MutationExperiment.objects.bulk_create(bulk_m)

    end = time.time()
    diff = round(end - current, 2)
    diff_2 = round(end - current_sheet, 2)
    print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped", str(skipped))
def import_xtal(self):
    """Import crystallization annotations from the annotation spreadsheet.

    Reads three sheets of ``self.annotation_file`` through ``self.parse_excel``
    and keys every row on its second column (the PDB code):

      * ``Xtal-methods``: one row per PDB (a later duplicate replaces the
        earlier one, after printing a warning). Columns used: 2 method,
        3 type, 4 sub type, 5/6 protein concentration and unit,
        7 temperature, 8/9 pH start/end, 10 remarks.
      * ``Xtal-Chemicals``: many rows per PDB. Columns used: 2 list name,
        3 chemical name, 4 chemical type, 5/6 concentration and unit.
      * ``PDB-ligand-complex``: only the first row per PDB is used. Columns
        used: 2 ligand name, 3 role, 4/5 concentration and unit, 6 id type,
        7 id.

    PDB codes of ``self.all_pdbs`` missing from a sheet are printed. For each
    PDB with a method row and a matching Construct, a Crystallization record
    (with chemical lists and ligand) is created and attached to the construct.
    Unparsable pH values are stored as 0.
    """
    xtals_list = {}
    for x in self.parse_excel(self.annotation_file, 'Xtal-methods'):
        if x[1] in xtals_list:
            print('pdbcode duplicate?', x[1])
        xtals_list[x[1]] = x

    xtals_chems_list = {}
    for x in self.parse_excel(self.annotation_file, 'Xtal-Chemicals'):
        xtals_chems_list.setdefault(x[1], []).append(x)

    xtal_ligands_list = {}
    for x in self.parse_excel(self.annotation_file, 'PDB-ligand-complex'):
        xtal_ligands_list.setdefault(x[1], []).append(x)

    missing = list(set(self.all_pdbs) - set(xtals_list.keys()))
    print(sorted(missing), " do not have any Xtal-methods annotated -- add them to sheet with NONE if they have none")
    missing = list(set(self.all_pdbs) - set(xtals_chems_list.keys()))
    print(sorted(missing), " do not have any Xtal-Chemicals annotated -- add them to sheet with NONE if they have none")
    missing = list(set(self.all_pdbs) - set(xtal_ligands_list.keys()))
    print(sorted(missing), " do not have any PDB-ligand-complex annotated -- add them to sheet with NONE if they have none")

    for pdb, x in xtals_list.items():
        try:
            construct = Construct.objects.get(structure__pdb_code__index=pdb.upper())
        except Exception:
            # Best effort: any lookup failure just skips this PDB.
            print(pdb, 'cannot find pdb construct')
            continue

        c = Crystallization()
        c_type, _ = CrystallizationTypes.objects.get_or_create(name=x[3], sub_name=x[4])
        c_method, _ = CrystallizationMethods.objects.get_or_create(name=x[2])
        c.crystal_type = c_type
        c.crystal_method = c_method
        c.remarks = x[10]
        c.temp = x[7]

        # Some entries have it wrong here
        try:
            c.ph_start = float(x[8])
            c.ph_end = float(x[9])
        except (TypeError, ValueError):
            c.ph_start = 0
            c.ph_end = 0

        c.protein_conc = x[5]
        c.protein_conc_unit = x[6]
        c.save()

        if pdb in xtals_chems_list:
            # Group this PDB's chemicals by their list name (column 2).
            chem_types = {}
            for chem_row in xtals_chems_list[pdb]:
                chem_types.setdefault(chem_row[2], []).append(chem_row)

            for ctype, typed_rows in chem_types.items():
                c_list = ChemicalList()
                list_name, _ = ChemicalListName.objects.get_or_create(name=ctype)
                c_list.name = list_name
                c_list.save()
                for ch in typed_rows:
                    ct, _ = ChemicalType.objects.get_or_create(name=ch[4])
                    chem, _ = Chemical.objects.get_or_create(name=ch[3], chemical_type=ct)
                    cc, _ = ChemicalConc.objects.get_or_create(
                        concentration=ch[5], concentration_unit=ch[6], chemical=chem)
                    c_list.chemicals.add(cc)
                c.chemical_lists.add(c_list)
        else:
            print('no chems for ', pdb)

        if pdb in xtal_ligands_list:
            ligand_row = xtal_ligands_list[pdb][0]
            ligand = get_or_make_ligand(ligand_row[7], ligand_row[6], ligand_row[2])
            role_slug = slugify(ligand_row[3])
            try:
                lr, _ = LigandRole.objects.get_or_create(slug=role_slug, defaults={'name': ligand_row[3]})
            except IntegrityError:
                # The role already exists under this slug; reuse it.
                lr = LigandRole.objects.get(slug=role_slug)

            if ligand:
                ligand_c = CrystallizationLigandConc()
                ligand_c.construct_crystallization = c
                ligand_c.ligand = ligand
                if lr:
                    ligand_c.ligand_role = lr
                if ligand_row[4]:
                    ligand_c.ligand_conc = ligand_row[4]
                if ligand_row[5]:
                    ligand_c.ligand_conc_unit = ligand_row[5]
                ligand_c.save()
                c.ligands.add(ligand_c)

        construct.crystallization = c
        construct.save()
def main_func(self, positions, iteration,count,lock): # filenames # if not positions[1]: # rows = self.data[positions[0]:] # else: # rows = self.data[positions[0]:positions[1]] missing_proteins = {} mutants_for_proteins = {} wrong_uniport_ids = {} c = 0 skipped = 0 inserted = 0 bulk_m = [] bulk_r = [] current_sheet = time.time() rows = self.data_all while count.value<len(rows): with lock: r = rows[count.value] count.value +=1 # for r in rows: # print(r['source_file'],c) # PRINT IF ERRORS OCCUR #self.logger.info('File '+str(r['source_file'])+' number '+str(c)) current = time.time() c += 1 # if c%100==0: # self.logger.info('Parsed '+str(c)+' mutant data entries') # publication try: #fix if it thinks it's float. float(r['reference']) r['reference'] = str(int(r['reference'])) float(r['review']) r['review'] = str(int(r['review'])) except ValueError: pass if r['reference'].isdigit(): #assume pubmed pub_type = 'pubmed' else: #assume doi pub_type = 'doi' if r['reference'] not in self.publication_cache: try: wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type) except WebLink.DoesNotExist: try: wl = WebLink.objects.create(index=r['reference'], web_resource = WebResource.objects.get(slug=pub_type)) except IntegrityError: wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type) try: pub = Publication.objects.get(web_link=wl) except Publication.DoesNotExist: pub = Publication() try: pub.web_link = wl pub.save() except IntegrityError: pub = Publication.objects.get(web_link=wl) if pub_type == 'doi': pub.update_from_doi(doi=r['reference']) elif pub_type == 'pubmed': pub.update_from_pubmed_data(index=r['reference']) try: pub.save() except: self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type) continue #if something off with publication, skip. 
self.publication_cache[r['reference']] = pub else: pub = self.publication_cache[r['reference']] # print(r['review'],r['reference']) if r['review'].isdigit(): #assume pubmed pub_type = 'pubmed' elif r['review'].startswith('http'): pub_type = 'raw_link' else: #assume doi pub_type = 'doi' # print(r['review'],pub_type) if r['review']: if r['review'] not in self.publication_cache: try: wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type) except WebLink.DoesNotExist: try: wl = WebLink.objects.create(index=r['review'], web_resource = WebResource.objects.get(slug=pub_type)) except IntegrityError: wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type) try: pub_review = Publication.objects.get(web_link=wl) except Publication.DoesNotExist: pub_review = Publication() try: pub_review.web_link = wl pub_review.save() except IntegrityError: pub_review = Publication.objects.get(web_link=wl) if pub_type == 'doi': pub_review.update_from_doi(doi=r['review']) elif pub_type == 'pubmed': pub_review.update_from_pubmed_data(index=r['review']) try: pub_review.save() except: self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type) continue #if something off with publication, skip. 
self.publication_cache[r['review']] = pub_review else: pub_review = self.publication_cache[r['review']] else: pub_review = None l = None if str(r['ligand_name']) in self.ligand_cache: if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]: l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] else: self.ligand_cache[str(r['ligand_name'])] = {} if not l: try: l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name'])) except Exception as msg: print('Something errored with ligand, aborting entry of mutation',r['ligand_name'],r['ligand_type'],r['ligand_id'],r['source_file']) print(msg) traceback.print_exc() continue self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l l_ref = None if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache: l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] else: if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): #if this name is canonical and it has a ligand record already l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True) elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists(): #if this matches an alias that only has "one" parent canonical name - eg distinct l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False) elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists(): #if this matches an alias that only has several canonical parents, must investigate, start with empty. 
lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = False l_ref.ambigious_alias = True l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) l_ref.save() elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists(): #amigious_alias not specified l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) l_ref.ambigious_alias = False l_ref.save() elif r['exp_mu_ligand_ref']: #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status. lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = True l_ref.ambigious_alias = False try: l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) except IntegrityError: if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True) else: l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) # print("error failing ligand, duplicate?") try: l_ref.save() except IntegrityError: l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) # print("error failing ligand, duplicate?") # logger.error("FAILED SAVING LIGAND, duplicate?") else: l_ref = None self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref protein_id = 0 residue_id = 0 protein=Protein.objects.filter(entry_name=r['protein']) if protein.exists(): protein=protein.get() if r['protein'] in mutants_for_proteins: mutants_for_proteins[r['protein']] += 1 else: mutants_for_proteins[r['protein']] = 1 elif r['protein'] not in missing_proteins: try: r['protein'] = wrong_uniport_ids[r['protein']] real_uniprot = wrong_uniport_ids[r['protein']] protein=Protein.objects.get(entry_name=r['protein']) # print('fetched with lookup table',r['protein']) except: # look for it as uniprot 
protein=Protein.objects.filter(web_links__web_resource__slug='uniprot', web_links__index=r['protein'].upper()) if protein.exists(): protein=protein.get() real_uniprot = protein.entry_name if r['protein'] in mutants_for_proteins: mutants_for_proteins[r['protein']] += 1 else: mutants_for_proteins[r['protein']] = 1 else: # Try to lookup in uniprot to catch typing errors / variants in entry_name url = 'http://www.uniprot.org/uniprot/$index.xml' cache_dir = ['uniprot', 'id'] uniprot_protein = fetch_from_web_api(url, r['protein'], cache_dir, xml = True) try: real_uniprot = uniprot_protein.find('.//{http://uniprot.org/uniprot}name').text.lower() protein=Protein.objects.get(entry_name=real_uniprot) except: skipped += 1 if r['protein'] in missing_proteins: missing_proteins[r['protein']] += 1 else: missing_proteins[r['protein']] = 1 # print('Skipped due to no protein '+ r['protein']) self.logger.error('Skipped due to no protein '+ r['protein']) continue wrong_uniport_ids[r['protein']] = protein.entry_name r['protein'] = real_uniprot else: missing_proteins[r['protein']] += 1 continue res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK if res.exists(): res=res.get() else: self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from']) # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file']) skipped += 1 continue if r['ligand_class']: try: l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'], defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed except Exception as e: if LigandRole.objects.filter(slug=slugify(r['ligand_class'])[:50]).exists(): l_role = LigandRole.objects.get(slug=slugify(r['ligand_class'])[:50]) if l_role.name == slugify(r['ligand_class'])[:50]: #if 
name of role is same as slug, then it was created by constructs script, replace it l_role.name = r['ligand_class'] l_role.save() else: print(e) print("Error with",r['ligand_class'],slugify(r['ligand_class'])[:50] ) l_role, created = LigandRole.objects.get_or_create(slug=slugify(r['ligand_class'])[:50]) # FIXME this should not be needed else: l_role = None if r['exp_type']: exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type']) else: exp_type_id = None if r['exp_func']: exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func']) else: exp_func_id = None if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']: exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop']) else: exp_qual_id = None # if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']: # exp_opt_id, created = MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist']) # else: # exp_opt_id = None try: mutation, created = Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res) except IntegrityError: mutation = Mutation.objects.get(amino_acid=r['mutation_to'],protein=protein, residue=res) logtypes = ['pEC50','pIC50','pK'] foldchange = 0 typefold = '' if r['exp_wt_value']!=0 and r['exp_mu_value_raw']!=0: #fix for new format if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']): #-log values! try: foldchange = round(math.pow(10,-r['exp_mu_value_raw'])/pow(10,-r['exp_wt_value']),3); except: print(r) typefold = r['exp_type']+"_log" elif "%"==r['exp_wt_unit']: # if % then it's a difference case, then lower value is bad. 
Otherwise it's conc and lower is better foldchange = round(r['exp_wt_value']/r['exp_mu_value_raw'],3); else: foldchange = round(r['exp_mu_value_raw']/r['exp_wt_value'],3); typefold = r['exp_type']+"_not_log" if foldchange>0 and foldchange<1 and foldchange!=0: foldchange = -round((1/foldchange),3) elif r['fold_effect']!=0: foldchange = round(r['fold_effect'],3); if foldchange<1: foldchange = -round((1/foldchange),3); r['fold_effect'] = foldchange raw_experiment = self.insert_raw(r) # raw_experiment.save() bulk = MutationExperiment( refs=pub, review=pub_review, submitting_group = r['submitting_group'], data_container = r['data_container'], data_container_number = r['data_container_number'], protein=protein, residue=res, ligand=l, ligand_role=l_role, ligand_ref = l_ref, # raw = raw_experiment, #raw_experiment, OR None # optional = exp_opt_id, exp_type=exp_type_id, exp_func=exp_func_id, exp_qual = exp_qual_id, mutation=mutation, wt_value=r['exp_wt_value'], # wt_unit=r['exp_wt_unit'], mu_value = r['exp_mu_value_raw'], mu_sign = r['exp_mu_effect_sign'], foldchange = foldchange, opt_receptor_expression = r['opt_receptor_expression'], opt_basal_activity = r['opt_basal_activity'], opt_gain_of_activity = r['opt_gain_of_activity'], opt_ligand_emax = r['opt_ligand_emax'], opt_agonist = r['opt_agonist'], ) # for line,val in r.items(): # val = str(val) # if len(val)>100: # print(line,"too long",val) # mut_id = obj.id bulk_r.append(raw_experiment) bulk_m.append(bulk) # try: # bulk.save() # except Exception as e: # print(e) # print(r) # break #print('saved ',r['source_file']) inserted += 1 end = time.time() diff = round(end - current,2) #print(diff) self.logger.info('Parsed '+str(c)+' mutant data entries. 
Skipped '+str(skipped)) current = time.time() raws = MutationRaw.objects.bulk_create(bulk_r) for i,me in enumerate(bulk_m): me.raw = raws[i] MutationExperiment.objects.bulk_create(bulk_m) end = time.time() diff = round(end - current,2) current_sheet diff_2 = round(end - current_sheet,2) print("overall",diff_2,"bulk",diff,len(bulk_m),"skipped",str(skipped)) sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1),reverse=True)