def main_func(self, positions, iteration, count, lock):
    while count.value < len(self.structures):
        with lock:
            s = self.structures[count.value]
            count.value += 1
        self.logger.info('Generating DSSP data for \'{}\'... ({} out of {})'.format(
            s, count.value, len(self.structures)))
        print(s)
        pdbcode = s.pdb_code.index.lower()
        chain = s.preferred_chain

        # Grab the DSSP db index number
        url = 'http://mrs.cmbi.ru.nl/search?db=dssp&q=%s&count=3' % pdbcode
        r = request.urlopen(url)
        t = r.geturl()
        d_id = t.split('=')[2][:-3]

        # Grab the DSSP file
        url = 'http://mrs.cmbi.ru.nl/download?db=dssp&nr=$index'
        cache_dir = ['dssp', 'id']
        dssp = fetch_from_web_api(url, d_id, cache_dir, raw=True)

        # Parse the file
        dssp = self.dssp_dict(dssp, chain)
        rs = Residue.objects.filter(protein_conformation=s.protein_conformation).all()
        for r in rs:
            if r.sequence_number in dssp:
                point, created = ResidueDataPoint.objects.get_or_create(
                    data_type=self.dssp_type, residue=r,
                    value_text=dssp[r.sequence_number])
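
# A minimal, self-contained sketch of the shared-counter pattern main_func uses
# to split self.structures across worker processes: each worker claims the next
# index under the lock, so no item is handled twice. The item list is
# illustrative, and the re-check inside the lock (not present above) guards the
# race between the while test and the claim.
from multiprocessing import Process, Value, Lock

def worker(items, count, lock):
    while count.value < len(items):
        with lock:
            if count.value >= len(items):
                break  # another worker claimed the last item first
            item = items[count.value]
            count.value += 1
        print('processing', item)

if __name__ == '__main__':
    items = ['2rh1', '4ldo', '3sn6', '5jqh']  # illustrative PDB codes
    count, lock = Value('i', 0), Lock()
    procs = [Process(target=worker, args=(items, count, lock)) for _ in range(2)]
    for proc in procs:
        proc.start()
    for proc in procs:
        proc.join()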
def load_by_gtop_id(self, ligand_name, gtop_id, ligand_type):
    logger = logging.getLogger('build')

    # get the data from cache or web services
    cache_dir = ['guidetopharmacology', 'ligands']
    url = 'http://www.guidetopharmacology.org/services/ligands/$index'
    gtop = fetch_from_web_api(url, gtop_id, cache_dir)

    if gtop:
        # get the name from the response
        ligand_name = gtop['name']
        if ligand_name == '11-<i>cis</i>-retinal':
            ligand_name = 'retinal'

    # does a ligand with this name already exist?
    try:
        existing_ligand = Ligand.objects.get(name=ligand_name, canonical=True)
        return existing_ligand
    except Ligand.DoesNotExist:
        web_resource = False
        if gtop_id:
            # gtoplig webresource
            web_resource = WebResource.objects.get(slug='gtoplig')
        return self.update_ligand(ligand_name, {}, ligand_type, web_resource, gtop_id)
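
# A minimal sketch of the '$index' convention used by fetch_from_web_api
# throughout these loaders: the URL templates are assumed to be resolved with
# string.Template, the same Template(...).substitute(index=...) call that
# fetch_pdb_info uses below when it records d['links'].
from string import Template
from urllib.parse import quote

url = 'http://www.guidetopharmacology.org/services/ligands/$index'
gtop_id = 471  # illustrative Guide to Pharmacology ligand id
full_url = Template(url).substitute(index=quote(str(gtop_id), safe=''))
print(full_url)  # http://www.guidetopharmacology.org/services/ligands/471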
def find_cid_for_chembl(self, chembl_mol_id):
    # find a PubChem CID based on a ChEMBL ID
    cache_dir = ['ebi', 'chembl', 'src_compound_id_all']
    url = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id_all/$index/1/22'
    lig_data = fetch_from_web_api(url, chembl_mol_id, cache_dir)

    not_found = False
    cid = False
    temp = []
    if not lig_data:
        # the lookup was not successful
        not_found = True
    else:
        try:
            for entry in lig_data:
                temp.append(entry['src_compound_id'])
            if len(temp) > 1:
                cid = ';'.join(temp)
                self.add_cid_to_dict(chembl_mol_id, cid)
            elif len(temp) == 1 and temp[0] != '\n':
                cid = temp[0]
                self.add_cid_to_dict(chembl_mol_id, cid)
            else:
                not_found = True
        except KeyError:
            not_found = True

    if not cid:
        # not found in UniChem, fall back to a PubChem name lookup
        cache_dir = ['pubchem', 'chembl', 'compound_name']
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/$index/json'
        lig_data = fetch_from_web_api(url, chembl_mol_id, cache_dir)
        if lig_data:
            try:
                cid = lig_data['PC_Compounds'][0]['id']['id']['cid']
                self.add_cid_to_dict(chembl_mol_id, cid)
                not_found = False
            except KeyError:
                not_found = True
    return cid, not_found
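
# A minimal sketch of the UniChem handling in find_cid_for_chembl, run on a
# mocked payload: src_compound_id_all/<id>/1/22 maps a ChEMBL ID (source 1) to
# PubChem CIDs (source 22) and returns a list of records; multiple CIDs are
# stored as one ';'-joined string. The IDs below are illustrative.
lig_data = [{'src_compound_id': '2244'}, {'src_compound_id': '134664'}]

temp = [entry['src_compound_id'] for entry in lig_data]
cid = ';'.join(temp) if len(temp) > 1 else temp[0]
print(cid)  # 2244;134664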
def find_cid(self, chembl_mol_ids, chembl_cid_dict):
    notfound = set()
    for chembl_mol_id in chembl_mol_ids:
        if chembl_mol_id not in chembl_cid_dict:
            temp = []
            # url = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id_all/'+chembl_mol_id+'/1/22'
            # response = requests.get(url)
            # lig_data = response.json()
            cache_dir = ['ebi', 'chembl', 'src_compound_id_all']
            url = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id_all/$index/1/22'
            lig_data = fetch_from_web_api(url, chembl_mol_id, cache_dir)
            if not lig_data:
                # the lookup was not successful
                notfound.add(chembl_mol_id)
                continue
            try:
                for entry in lig_data:
                    temp.append(entry['src_compound_id'])
                if len(temp) > 1:
                    cid = ';'.join(temp)
                    chembl_cid_dict[chembl_mol_id] = cid
                    self.add_cid_to_dict(chembl_mol_id, cid)
                elif len(temp) == 1 and temp[0] != '\n':
                    cid = temp[0]
                    # update the existing dictionary
                    chembl_cid_dict[chembl_mol_id] = cid
                    self.add_cid_to_dict(chembl_mol_id, cid)
                else:
                    notfound.add(chembl_mol_id)
            except KeyError:
                notfound.add(chembl_mol_id)
    # TODO: returning both may be redundant, since chembl_cid_dict is updated in place
    return chembl_cid_dict, notfound
def main_func(self, positions, iteration, count, lock):
    ##### Create the ChEMBL compound link and connect it to the corresponding ligand/CID #####
    if iteration == 0:
        # First load makes sure the ligands are there
        list_of_chembl_ids = self.chembl_mol_ids
        while count.value < len(list_of_chembl_ids):
            with lock:
                chembl_ligand = list_of_chembl_ids[count.value]
                count.value += 1
            if count.value % 1000 == 0:
                print('{} Status {} out of {}'.format(
                    datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'),
                    count.value, len(list_of_chembl_ids)))

            l = Ligand.objects.filter(properities__web_links__web_resource__slug='chembl_ligand',
                                      properities__web_links__index=chembl_ligand).first()
            if l:
                cid = l.properities.web_links.filter(web_resource__slug='pubchem').first()
                if cid:
                    cid = cid.index
                else:
                    l = None  # make sure the code below is run
            if not l:
                # if l already has a ChEMBL link, assume all is good
                if chembl_ligand not in self.chembl_cid_dict:
                    cids, not_found = self.find_cid_for_chembl(chembl_ligand)
                    if not_found:
                        print('SKIPPED: Could not determine CID', chembl_ligand, cids)
                        continue
                else:
                    cids = self.chembl_cid_dict[chembl_ligand]
                # perhaps we should load all of the CIDs; use the first one if there are several
                temp = str(cids).split(';')
                cid = str(temp[0])
                l = get_or_make_ligand(cid, 'PubChem CID')
                if not l:
                    print('SKIPPED: Ligand not found in PubChem', cid)
                    continue
                if not l.properities.web_links.filter(web_resource__slug='pubchem', index=cid).exists():
                    # No CID for the ligand: rare cases where SMILES was used for the initial lookup
                    wl, created = WebLink.objects.get_or_create(index=cid, web_resource=self.wr_pubchem)
                    l.properities.web_links.add(wl)
                if not l.properities.web_links.filter(web_resource__slug='chembl_ligand', index=chembl_ligand).exists():
                    wl, created = WebLink.objects.get_or_create(index=chembl_ligand, web_resource=self.wr)
                    l.properities.web_links.add(wl)

            ###### Vendor data ######
            if not len(l.properities.vendors.all()):
                # If it already has some, assume they are all loaded
                cache_dir = ['pubchem', 'cid', 'vendors']
                url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/categories/compound/$index/JSON/'
                vendors = fetch_from_web_api(url, cid, cache_dir)
                if vendors:
                    for vendor_data in vendors['SourceCategories']['Categories'][0]['Sources']:
                        lv, created = LigandVendors.objects.get_or_create(slug=slugify(vendor_data['SourceName']))
                        lv.name = vendor_data['SourceName']
                        if 'SourceURL' in vendor_data:
                            lv.url = vendor_data['SourceURL']
                        lv.save()
                        if 'SID' in vendor_data:
                            lvls = LigandVendorLink.objects.filter(sid=vendor_data['SID'])
                            if not lvls.exists():
                                lvl = LigandVendorLink()
                                lvl.vendor = lv
                                lvl.lp = l.properities
                                lvl.sid = vendor_data['SID']
                                if 'RegistryID' in vendor_data:
                                    lvl.vendor_external_id = vendor_data['RegistryID']
                                if 'SourceRecordURL' in vendor_data:
                                    lvl.url = vendor_data['SourceRecordURL']
                                else:
                                    continue
                                lvl.save()

    elif iteration == 1:
        # Third load loads the experiments (based on ligand/assay)
        header = self.header_dict
        skipped = 0
        non_p = []
        wr_chembl_assays = WebResource.objects.get(slug='chembl_assays')
        while count.value < len(self.data):
            with lock:
                record = self.data[count.value]
                count.value += 1
            if count.value % 10000 == 0:
                print('{} Status {} out of {}'.format(
                    datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'),
                    count.value, len(self.data)))

            target = record[header['target_chembl_id']]
            assay_id = record[header['assay_chembl_id']]
            assay, created = ChemblAssay.objects.get_or_create(assay_id=assay_id)
            if created:
                wl, created = WebLink.objects.get_or_create(index=assay_id, web_resource=wr_chembl_assays)
                assay.web_links.add(wl)

            ligand = record[header['molecule_chembl_id']]
            p = Protein.objects.filter(web_links__index=target,
                                       web_links__web_resource__slug='chembl').first()
            if not p:
                if target not in non_p:
                    non_p.append(target)
                    print('Protein not found!', target)
                continue

            ls = Ligand.objects.filter(properities__web_links__index=ligand,
                                       properities__web_links__web_resource__slug='chembl_ligand',
                                       canonical=True)
            if not ls.exists():
                # if no ligand matches this, then ignore -- be sure this works later
                skipped += 1
                continue
            for l in ls:
                if len(ls) > 1:
                    print('issue with canonical! give to munk', l, l.pk, ligand)
                break

            assay_experiments = AssayExperiment.objects.filter(protein=p, ligand=l, assay=assay)
            if assay_experiments.exists():
                assay_experiment = assay_experiments.get()
            else:
                assay_experiment = AssayExperiment()
                assay_experiment.assay = assay
                assay_experiment.ligand = l
                assay_experiment.protein = p

            assay_experiment.assay_type = record[header['assay_type']]
            assay_experiment.pchembl_value = record[header['pchembl_value']]
            assay_experiment.assay_description = record[header['assay_description']]
            assay_experiment.published_value = record[header['published_value']]
            assay_experiment.published_relation = record[header['published_relation']]
            assay_experiment.published_type = record[header['published_type']]
            assay_experiment.published_units = record[header['published_units']]
            assay_experiment.standard_value = record[header['standard_value']]
            assay_experiment.standard_relation = record[header['standard_relation']]
            assay_experiment.standard_type = record[header['standard_type']]
            assay_experiment.standard_units = record[header['standard_units']]
            try:
                assay_experiment.save()
            except IntegrityError:
                assay_experiment = AssayExperiment.objects.get(protein=p, ligand=l, assay=assay)
        print('done, skipped:', skipped)
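
# A minimal sketch of walking the PubChem PUG-View vendor payload used above,
# with a mocked response shaped the way the loop consumes it
# (SourceCategories -> Categories[0] -> Sources). Values are illustrative.
vendors = {
    'SourceCategories': {
        'Categories': [{
            'Category': 'Chemical Vendors',
            'Sources': [
                {'SourceName': 'Sigma-Aldrich',
                 'SourceURL': 'https://www.sigmaaldrich.com',
                 'SID': 24278805,
                 'RegistryID': 'A7631',
                 'SourceRecordURL': 'https://example.org/A7631'},
            ],
        }],
    }
}
for vendor_data in vendors['SourceCategories']['Categories'][0]['Sources']:
    print(vendor_data['SourceName'], vendor_data.get('SID'), vendor_data.get('RegistryID'))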
def handle(self, *args, **options):
    cache_dir = ['ensembl2', 'isoform']
    url = 'https://rest.ensembl.org/sequence/id/$index?content-type=application/json&type=protein'
    # Alternative:
    # url = 'https://grch37.rest.ensembl.org/sequence/id/$index?db_type=core;object_type=predictiontranscript;content-type=application/json;species=homo_sapiens;type=protein'
    url = 'https://grch37.rest.ensembl.org/sequence/id/$index?type=protein;content-type=application/json'

    filepath = 'protein/data/Isoform_annotation_table.txt'
    isoforms = []
    with open(filepath, "r", encoding='UTF-8') as f:
        for row in f:
            c = row.split("\t")
            isoforms.append(c)

    total_matches = 0
    total_mismatches = 0
    total_mismatches_1 = 0
    total_align_match = 0
    total_align_mismatch = 0
    isoforms_with_issue = {}
    dump = {}
    # Skip the header row
    for c, i in enumerate(isoforms[1:]):
        p = '{}_human'.format(i[0].lower())
        print(p)
        protein = Protein.objects.get(entry_name=p, sequence_type__slug='wt',
                                      species__common_name='Human')
        wt_seq = protein.sequence
        rs = Residue.objects.filter(protein_conformation__protein=protein).prefetch_related(
            'protein_segment', 'display_generic_number', 'generic_number')
        r_lookup = {}
        r_segment = {}
        for r in rs:
            r_lookup[r.sequence_number] = [r.protein_segment.slug,
                                           str(r.display_generic_number),
                                           r.sequence_number]
            if r.protein_segment.slug not in r_segment:
                r_segment[r.protein_segment.slug] = 0
            r_segment[r.protein_segment.slug] += 1

        seq_filename = "protein/data/MSA_GPCR_isoforms/{}_isoform_MSA.fa".format(p.lower())
        with open(seq_filename, "r") as myfile:
            fasta_raw = myfile.read()
        fasta = fasta_raw.splitlines()
        wt_seq2 = fasta[1]

        es = i[3].split(", ")
        isoform_id = i[1]
        print(c, len(isoforms), p, isoform_id, es)
        wt_check = wt_seq == wt_seq2.replace("-", "")
        if not wt_check:
            print(p, 'WT SEQ NO MATCH!!')

        ranges = {}
        for e in es:
            iso_seq_msa = fasta[1 + int(isoform_id) * 2]
            iso_seq_msa_corrected = ''
            for pos, a in enumerate(iso_seq_msa):
                if wt_seq2[pos] == '-' and a == '-':
                    continue
                iso_seq_msa_corrected += a
            isoform_info = fetch_from_web_api(url, e, cache_dir)
            if isoform_info:
                iso_seq = isoform_info['seq']
                iso_check = iso_seq == iso_seq_msa.replace("-", "")
                if not iso_check:
                    isoforms_with_issue[p + "_" + e] = "Sequence does not match with API"
                    total_mismatches += 1
                    if iso_seq == iso_seq_msa.replace("-", "")[:-1]:
                        total_mismatches_1 += 1
                else:
                    total_matches += 1

                pw2 = pairwise2.align.globalms(wt_seq, iso_seq, 2, -5, -10, -.5)
                aln_ref = pw2[0][0]
                aln_isoform = pw2[0][1]
                if aln_isoform != iso_seq_msa_corrected:
                    isoforms_with_issue[p + "_" + e] = "Alignment differs from pairwise, see alignment for sanity"
                    total_align_mismatch += 1
                else:
                    total_align_match += 1

                gaps = 0
                gaps_iso = 0
                missing_pos = []
                missing_pos_iso = []
                res_correct = {}
                isoform_missing_segment = {}
                count_segment = {}
                for i, r in enumerate(aln_ref, 1):
                    if aln_isoform[i - 1] == '-':
                        gaps_iso += 1
                    if r == "-":
                        res_correct[i] = ['', '', '']
                        gaps += 1
                        if aln_isoform[i - 1] != "-":
                            # The reference is missing this position
                            missing_pos_iso.append(i - gaps_iso)
                            if i - gaps == 0:
                                # Take the N-term if it is the beginning
                                isoform_missing_segment[i - gaps_iso] = r_lookup[1][0]
                            else:
                                isoform_missing_segment[i - gaps_iso] = (i - gaps, r_lookup[i - gaps][0])
                    else:
                        res_correct[i] = aln_ref[i - gaps - 1]
                        if aln_isoform[i - 1] == "-":
                            # The isoform is missing this position
                            missing_pos.append(i - gaps)
                        else:
                            segment = r_lookup[i - gaps][0]
                            if segment not in count_segment:
                                count_segment[segment] = 0
                            count_segment[segment] += 1

                result_segment = {}
                for segment, value in r_segment.items():
                    if segment in count_segment:
                        freq = round(count_segment[segment] / value, 2)
                        count = count_segment[segment]
                    else:
                        freq = 0
                        count = 0
                    if freq != 1:
                        # If the segment is incomplete, save it
                        result_segment[segment] = [freq, count, value]

                ranges = {}
                ranges['deleted_ref'] = []
                ranges['inserts'] = []
                ranges['segments_altered'] = result_segment
                for k, g in groupby(enumerate(missing_pos), lambda x: x[0] - x[1]):
                    group = list(map(itemgetter(1), g))
                    # Segment slugs of the flanking positions
                    from_segment = r_lookup[group[0]][0]
                    to_segment = r_lookup[group[-1]][0]
                    ranges['deleted_ref'].append({
                        'from': [group[0], from_segment],
                        'to': [group[-1], to_segment],
                        'length': len(group)
                    })
                for k, g in groupby(enumerate(missing_pos_iso), lambda x: x[0] - x[1]):
                    group = list(map(itemgetter(1), g))
                    inserted_into = isoform_missing_segment[group[0]]
                    ranges['inserts'].append({
                        'from': group[0],
                        'to': group[-1],
                        'inserted_into': inserted_into,
                        'length': len(group)
                    })
            else:
                print(e, 'no info')

        key = '{}_{}'.format(p, isoform_id)
        dump[key] = ranges
        dump[key]['e_ids'] = es

    f = open('protein/data/isoforms.json', 'w')
    json.dump(dump, f, indent=4, separators=(',', ': '))

    print("SUMMARY")
    print("TOTAL MATCHES of isoform seq", total_matches)
    print("ALIGNMENT MATCH", total_align_match, "MISMATCH", total_align_mismatch)
    print("TOTAL MISMATCHES of isoform seq", total_mismatches)
    print("TOTAL MISMATCHES of isoform seq (MSA has one extra)", total_mismatches_1)
    for e, r in isoforms_with_issue.items():
        print(e, r)
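
# A minimal sketch of the consecutive-run idiom used above (and again in
# fetch_pdb_info): grouping on index-minus-value collapses runs of consecutive
# positions into (start, end) ranges. Positions are illustrative.
from itertools import groupby
from operator import itemgetter

missing_pos = [4, 5, 6, 12, 13, 20]
ranges = []
for k, g in groupby(enumerate(missing_pos), lambda x: x[0] - x[1]):
    group = list(map(itemgetter(1), g))
    ranges.append((group[0], group[-1]))
print(ranges)  # [(4, 6), (12, 13), (20, 20)]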
def update_from_doi(self, doi):
    logger = logging.getLogger('build')

    # should Entrez be tried as a backup?
    try_entrez_on_fail = False

    # check whether this data is cached
    cache_dir = ['crossref', 'doi']
    url = 'http://api.crossref.org/works/$index'
    pub = fetch_from_web_api(url, doi, cache_dir)

    if pub:
        # update the record
        try:
            self.title = pub['message']['title'][0]
            try:
                self.year = pub['message']['created']['date-parts'][0][0]
            except:
                self.year = pub['message']['deposited']['date-parts'][0][0]

            # go from [{'family': 'Gloriam', 'given': 'David E.'}] to ['Gloriam DE']
            authors = ['{} {}'.format(x['family'],
                                      ''.join([y[:1] for y in x['given'].split()]))
                       for x in pub['message']['author']]
            self.authors = ', '.join(authors)

            # get volume and pages if available
            reference = {}
            fields = ['volume', 'page']
            for f in fields:
                if f in pub['message']:
                    reference[f] = pub['message'][f]
                else:
                    reference[f] = 'X'
            self.reference = '{}:{}'.format(reference['volume'], reference['page'])

            # journal
            journal = pub['message']['container-title'][0]
            try:
                # not all records have the journal abbreviation
                journal_abbr = pub['message']['container-title'][1]
            except:
                journal_abbr = slugify(journal)
            try:
                self.journal, created = PublicationJournal.objects.get_or_create(
                    name=journal, defaults={'slug': journal_abbr})
                if created:
                    logger.info('Created journal {}'.format(journal))
            except IntegrityError:
                self.journal = PublicationJournal.objects.get(name=journal)
        except Exception as msg:
            logger.warning('Processing data from CrossRef for {} failed: {}'.format(doi, msg))
            try_entrez_on_fail = False
    else:
        print("Publication not on crossref", doi)
        try_entrez_on_fail = False

    if try_entrez_on_fail:
        # try searching Entrez for the DOI
        try:
            Entrez.email = '*****@*****.**'
            record = Entrez.read(Entrez.esearch(db='pubmed', retmax=1, term=doi))
            self.update_from_pubmed_data(record['IdList'][0])
        except:
            return False
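
# A minimal sketch of the author formatting above, on a mocked CrossRef
# 'message' fragment: family name plus one initial per given-name token.
message = {'author': [{'family': 'Gloriam', 'given': 'David E.'},
                      {'family': 'Munk', 'given': 'Christian'}]}
authors = ['{} {}'.format(x['family'], ''.join([y[:1] for y in x['given'].split()]))
           for x in message['author']]
print(', '.join(authors))  # Gloriam DE, Munk C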
def fetch_pdb_info(pdbname, protein):
    logger = logging.getLogger('build')
    d = OrderedDict()
    d['construct_crystal'] = {}
    d['construct_crystal']['pdb'] = pdbname
    d['construct_crystal']['pdb_name'] = 'auto_' + pdbname
    d['construct_crystal']['uniprot'] = protein.parent.entry_name
    d['contact_info'] = {}
    d['contact_info']['name_cont'] = 'gpcrdb'
    d['contact_info']['pi_email'] = '*****@*****.**'
    d['contact_info']['pi_name'] = 'gpcrdb'
    d['contact_info']['url'] = 'gpcrdb.org'
    d['contact_info']['date'] = time.strftime('%m/%d/%Y')
    d['contact_info']['address'] = ''
    d['protein'] = protein.parent.name
    d['wt_seq'] = protein.parent.sequence
    d['pdb'] = pdbname
    d['links'] = []
    d['xml_not_observed'] = []
    d['xml_segments'] = []

    pos_in_wt = list(range(1, len(d['wt_seq']) + 1))

    ## get the UniProt accession to entry name mapping
    # http://files.gpcrdb.org/uniprot_mapping.txt
    url = 'http://files.gpcrdb.org/uniprot_mapping.txt'
    req = urlopen(url)
    uniprot_mapping = req.read().decode('UTF-8')
    rows = (line.split(' ') for line in uniprot_mapping.split('\n'))
    uniprot_mapping = {row[0]: row[1:] for row in rows}
    # fix known errors in the mapping file
    uniprot_mapping['P08483'] = ['acm3_rat']
    uniprot_mapping['P42866'] = ['oprm_mouse']

    # ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/xml/1xyz.xml.gz
    cache_dir = ['sifts', 'xml']
    url = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/xml/$index.xml.gz'
    sifts = fetch_from_web_api(url, pdbname.lower(), cache_dir, xml=True)
    d['links'].append(Template(url).substitute(index=quote(str(pdbname.lower()), safe='')))

    d['mutations'] = []
    d['auxiliary'] = OrderedDict()
    receptor_seq_ids = []
    receptor_chain = ''
    if sifts:  # success
        insert_position = 'N-term'
        insert_start = 0
        msg_1 = 0
        msg_2 = 0
        # first pass: find the receptor chain
        for elem in sifts.findall('.//{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}segment'):
            receptor = False
            chain = elem.attrib['segId'].split('_')[1]
            for res in elem[0]:  # the first element is the residue list
                if receptor_chain != '':
                    break  # stop when found
                for node in res:
                    if node.tag == '{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}crossRefDb':
                        source = node.attrib['dbSource']
                        if source == 'UniProt':
                            u_id = node.attrib['dbAccessionId']
                            if u_id in uniprot_mapping:
                                receptor_chain = chain
                                break
        # second pass: collect segments, mutations and positions
        for elem in sifts.findall('.//{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}segment'):
            if 'segId' not in elem.attrib:
                continue  # not a receptor segment
            seg_uniprot_ids = []
            max_pos = 0
            min_pos = 99999
            pos_list = []
            uniprot_pos = None
            receptor = False
            u_id_source = 'N/A'
            chain = elem.attrib['segId'].split('_')[1]
            seg_resid_list = []
            for res in elem[0]:  # the first element is the residue list
                u_id = 'N/A'
                pdb_aa = ''
                for node in res:
                    if node.tag == '{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}crossRefDb':
                        source = node.attrib['dbSource']
                        if source == 'UniProt':
                            u_id = node.attrib['dbAccessionId']
                            u_id_source = 'UniProt'
                            if u_id in uniprot_mapping:
                                u_id = uniprot_mapping[u_id][0]
                                receptor = True  # this is a receptor element
                                if receptor_chain == '' or receptor_chain == chain:
                                    receptor_chain = chain
                                elif msg_1 == 0:
                                    msg_1 = 1
                                    logger.warning('{} has receptor in many chains {} {}'.format(
                                        pdbname.lower(), chain, receptor_chain))
                                insert_position = 'Within Receptor'
                            if u_id not in seg_uniprot_ids:
                                seg_uniprot_ids.append(u_id)
                            uniprot_pos = int(node.attrib['dbResNum'])
                            uniprot_aa = node.attrib['dbResName']
                        elif source == 'PDB' and node.attrib['dbResNum'].isdigit():
                            pos = int(node.attrib['dbResNum'])
                            try:
                                pdb_aa = AA_three[node.attrib['dbResName'].upper()]
                            except:
                                pdb_aa = "X"
                            if receptor:
                                receptor_seq_ids.append(pos)
                            seg_resid_list.append(pos)
                            if pos > max_pos:
                                max_pos = pos
                            if pos < min_pos:
                                min_pos = pos
                    elif pdb_aa and node.tag == '{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}residueDetail':
                        if node.text == 'Not_Observed' and receptor:
                            d['xml_not_observed'].append(uniprot_pos)
                        elif node.attrib['property'] == 'Annotation' and u_id == 'N/A':
                            u_id = node.text
                            if u_id not in seg_uniprot_ids:
                                seg_uniprot_ids.append(u_id)
                        elif receptor and node.attrib['property'] == 'Annotation' and node.text == 'Engineered mutation':
                            # only in the receptor; prevent duplicates
                            if {'mut': pdb_aa, 'wt': uniprot_aa, 'pos': uniprot_pos, 'type': ''} not in d['mutations']:
                                d['mutations'].append({'mut': pdb_aa, 'wt': uniprot_aa,
                                                       'pos': uniprot_pos, 'type': ''})
                if uniprot_pos:
                    pos_list.append(uniprot_pos)
                    if receptor and uniprot_pos in pos_in_wt:
                        pos_in_wt.remove(uniprot_pos)
                        insert_start = str(uniprot_pos + 1)
                    elif receptor:
                        # odd case: position already removed
                        pass

            ranges = []
            for k, g in groupby(enumerate(pos_list), lambda x: x[0] - x[1]):
                group = list(map(itemgetter(1), g))
                ranges.append((group[0], group[-1]))

            if receptor == False and u_id_source == 'UniProt':
                url = 'http://www.uniprot.org/uniprot/$index.xml'
                insert_info = fetch_from_web_api(url, seg_uniprot_ids[0], cache_dir, xml=True)
                d['links'].append(Template(url).substitute(index=quote(str(seg_uniprot_ids[0]), safe='')))
                for elm in insert_info.findall('.//{http://uniprot.org/uniprot}recommendedName'):
                    seg_uniprot_ids[0] = elm.find('{http://uniprot.org/uniprot}fullName').text

            d['xml_segments'].append([elem.attrib['segId'], seg_uniprot_ids, min_pos,
                                      max_pos, ranges, insert_position, seg_resid_list])

            if receptor == False and receptor_chain == chain:
                # not the receptor, but in the same chain
                if len(seg_uniprot_ids):
                    subtype = seg_uniprot_ids[0]
                else:
                    subtype = 'N/A'
                    continue  # do not add segments without information
                if subtype == 'Not_Observed':
                    continue  # ignore "aux" that is 'not observed'
                if subtype == 'Engineered mutation':
                    continue  # likewise for engineered-mutation annotations
                if subtype == 'S-arrestin':
                    continue  # S-arrestin is not part of the chain
                d['auxiliary']['aux' + str(len(d['auxiliary']))] = {
                    'type': 'auto', 'subtype': subtype, 'presence': 'YES',
                    'position': insert_position, 'start': insert_start}
            elif receptor == False:
                logger.warning('{} Protein in structure, but not part of receptor chain {} {}'.format(
                    pdbname.lower(), seg_uniprot_ids, chain))

        d['deletions'] = []
        for k, g in groupby(enumerate(pos_in_wt), lambda x: x[0] - x[1]):
            group = list(map(itemgetter(1), g))
            d['deletions'].append({'start': group[0], 'end': group[-1], 'origin': 'user'})

        d['not_observed'] = []
        for k, g in groupby(enumerate(d['xml_not_observed']), lambda x: x[0] - x[1]):
            group = list(map(itemgetter(1), g))
            d['not_observed'].append((group[0], group[-1]))
    else:
        pass  # SIFTS lookup failed

    ## experiment data
    # http://www.ebi.ac.uk/pdbe/api/pdb/entry/experiment/2RH1
    cache_dir = ['pdbe', 'experiment']
    url = 'http://www.ebi.ac.uk/pdbe/api/pdb/entry/experiment/$index'
    pdbe = fetch_from_web_api(url, pdbname, cache_dir)
    d['links'].append(Template(url).substitute(index=quote(str(pdbname), safe='')))
    if pdbe:  # success
        r = pdbe[pdbname.lower()][0]
        d['resolution'] = r.get('resolution')
        d['crystal_growth'] = r.get('crystal_growth')
        d['r_factor'] = r.get('r_factor')
        d['experimental_method'] = r.get('experimental_method')
    else:
        pass  # PDBe lookup failed

    # (a PDBe modified_AA_or_NA lookup was tried here previously; see
    # http://www.ebi.ac.uk/pdbe/api/pdb/entry/modified_AA_or_NA/2RH1)

    ## modifications from the RCSB Jmol endpoint -- a "hacky" way to get them
    # http://www.rcsb.org/pdb/explore/jmol.do?structureId=4LDO&json=true
    cache_dir = ['rcsb', 'jmol_modifications']
    url = 'http://www.rcsb.org/pdb/explore/jmol.do?structureId=$index&json=true'
    rcsb_mod = fetch_from_web_api(url, pdbname, cache_dir)
    d['links'].append(Template(url).substitute(index=quote(str(pdbname), safe='')))
    if rcsb_mod:  # success
        d['modifications'] = []
        d['modifications2'] = rcsb_mod
        for mod in rcsb_mod['protmod']['domains']:
            t = mod['range'].split(',')
            if t[0].split(':')[1] != receptor_chain:
                # modification not in the receptor chain; not interested
                continue
            if len(t) > 1:
                position_type = 'pair'
                position_info = [t[0].split(':')[0], t[1].split(':')[0]]
            elif len(t) == 1:
                position_type = 'single'
                position_info = [t[0].split(':')[0], 0]
            else:
                print('error', t)
                continue
            if mod['id'] == 'crosslink2':
                mod['id'] = "Disulfide bond"  # replace the non-descript 'crosslink2'
            d['modifications'].append({'position': [position_type, position_info],
                                       'type': mod['id'], 'remark': mod['description']})
    else:
        d['modifications2'] = 'None'

    ## ligands
    # http://www.ebi.ac.uk/pdbe/api/pdb/entry/ligand_monomers/2RH1
    cache_dir = ['pdbe', 'ligands']
    url = 'http://www.ebi.ac.uk/pdbe/api/pdb/entry/ligand_monomers/$index'
    pdbe_ligands = fetch_from_web_api(url, pdbname, cache_dir)
    d['links'].append(Template(url).substitute(index=quote(str(pdbname), safe='')))
    if pdbe_ligands:  # success
        d['ligands'] = {}
        for name, pdb in pdbe_ligands.items():
            for ligand in pdb:
                if ligand['chem_comp_id'] not in d['ligands']:
                    d['ligands'][ligand['chem_comp_id']] = {
                        'comp_name': ligand['chem_comp_name'], 'number_of_entries': 1}
                else:
                    d['ligands'][ligand['chem_comp_id']]['number_of_entries'] += 1
    else:
        d['ligands'] = 'None'

    ## NOT NEEDED - mutations are fetched from the SIFTS XML instead
    # (a PDBe mutated_AA_or_NA lookup was used here previously; see
    # http://www.ebi.ac.uk/pdbe/api/pdb/entry/mutated_AA_or_NA/2RH1)

    ## UniProt mappings
    # http://www.rcsb.org/pdb/rest/das/pdb_uniprot_mapping/alignment?query=2RH1
    # seems to be IDs of entities; then use e.g. http://www.uniprot.org/uniprot/P00720.xml
    cache_dir = ['rcsb', 'pdb_uniprot_mapping']
    url = 'http://www.rcsb.org/pdb/rest/das/pdb_uniprot_mapping/alignment?query=$index'
    uniprot_map = fetch_from_web_api(url, pdbname, cache_dir, xml=True)
    d['links'].append(Template(url).substitute(index=quote(str(pdbname), safe='')))
    if uniprot_map:  # success
        inserts = {}
        inserts_fixed = {}
        for block in uniprot_map[0]:
            if block.tag[-5:] != 'block':
                continue  # only interested in the blocks
            i = 0
            for segment in block:
                if i == 0:
                    construct_range = [segment.attrib['start'], segment.attrib['end']]
                else:
                    insert_range = [segment.attrib['start'], segment.attrib['end']]
                    insert_id = segment.attrib['intObjectId']
                prev_block = segment
                i += 1
            i = inserts.setdefault(insert_id, [])
            i.append({'c': construct_range, 'i': insert_range})

        for insert, blocks in inserts.items():
            if insert in uniprot_mapping:
                insert = uniprot_mapping[insert][0]
            inserts_fixed[insert] = {}
            cache_dir = ['uniprot', 'id']
            url = 'http://www.uniprot.org/uniprot/$index.xml'
            insert_info = fetch_from_web_api(url, insert, cache_dir, xml=True)
            d['links'].append(Template(url).substitute(index=quote(str(insert), safe='')))
            for elm in insert_info.findall('.//{http://uniprot.org/uniprot}recommendedName'):
                inserts_fixed[insert]['alt_name'] = elm.find('{http://uniprot.org/uniprot}fullName').text

            blocks_num = len(blocks)
            prev_block = None
            temp = []
            for i, b in enumerate(blocks):  # glue the blocks together
                if i == 0:
                    start = [b['i'][0], b['c'][0]]
                    end = [b['i'][1], b['c'][1]]
                if i < blocks_num - 1:  # if not the last block
                    if int(b['i'][1]) == int(blocks[i + 1]['i'][0]) - 1 and int(b['c'][1]) == int(blocks[i + 1]['c'][0]) - 1:
                        # both the insert and the construct continue
                        end = [blocks[i + 1]['i'][1], blocks[i + 1]['c'][1]]
                    else:  # gap
                        temp.append({'i_start': start[0], 'i_end': end[0],
                                     'c_start': start[1], 'c_end': end[1]})
                        start = [blocks[i + 1]['i'][0], blocks[i + 1]['c'][0]]
                        end = [blocks[i + 1]['i'][1], blocks[i + 1]['c'][1]]
            temp.append({'i_start': start[0], 'i_end': end[0],
                         'c_start': start[1], 'c_end': end[1]})
            i = inserts_fixed[insert].setdefault('positions', [])
            i.append(temp)
        d['inserts'] = inserts_fixed
    else:
        pass  # uniprot_map lookup failed

    return d
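
# A minimal sketch of the namespaced findall pattern used on the SIFTS XML
# above: every tag must carry the full eFamily namespace in {braces}. The
# fragment is illustrative, not a real SIFTS record.
import xml.etree.ElementTree as ET

NS = '{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}'
xml_doc = """
<entry xmlns="http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd">
  <segment segId="1xyz_A_1_300">
    <listResidue>
      <residue>
        <crossRefDb dbSource="UniProt" dbAccessionId="P07550" dbResNum="29" dbResName="D"/>
      </residue>
    </listResidue>
  </segment>
</entry>
"""
root = ET.fromstring(xml_doc)
for elem in root.findall('.//' + NS + 'segment'):
    chain = elem.attrib['segId'].split('_')[1]
    for node in elem.iter(NS + 'crossRefDb'):
        if node.attrib['dbSource'] == 'UniProt':
            print(chain, node.attrib['dbAccessionId'], node.attrib['dbResNum'])  # A P07550 29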
def load_from_pubchem(self, lookup_type, pubchem_id, ligand_type, ligand_title=False):
    logger = logging.getLogger('build')

    # if a ligand title is specified, use that as the name
    if ligand_title:
        ligand_name = ligand_title
    # otherwise, fetch the ligand name from PubChem
    else:
        # check cache
        cache_dir = ['pubchem', 'cid', 'synonyms']
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/synonyms/json'.format(lookup_type)
        pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)

        # get the name from the response
        try:
            ligand_name = pubchem['InformationList']['Information'][0]['Synonym'][0]
        except:
            # Some compounds (e.g. peptides) are valid PubChem entries without a name
            logger.warning('Ligand {} does not have a name in PubChem'.format(pubchem_id))
            ligand_name = lookup_type + ' ' + pubchem_id

    # fetch ligand properties from PubChem
    properties = {}
    # check cache
    cache_dir = ['pubchem', 'cid', 'property']
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/property/CanonicalSMILES,InChIKey,MolecularWeight,HBondDonorCount,HBondAcceptorCount,XLogP,RotatableBondCount/json'.format(lookup_type)
    pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)

    # get the properties from the response
    if pubchem == False:
        logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
        return None
    record = pubchem['PropertyTable']['Properties'][0]
    if record:
        if 'HBondAcceptorCount' in record:
            properties['hacc'] = record['HBondAcceptorCount']
        if 'HBondDonorCount' in record:
            properties['hdon'] = record['HBondDonorCount']
        if 'XLogP' in record:
            properties['logp'] = record['XLogP']
        if 'RotatableBondCount' in record:
            properties['rotatable_bonds'] = record['RotatableBondCount']
        if 'MolecularWeight' in record:
            properties['mw'] = record['MolecularWeight']
    try:
        properties['smiles'] = record['CanonicalSMILES']
        properties['inchikey'] = record['InChIKey']
    except:
        logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
        return None

    # pubchem webresource
    web_resource = WebResource.objects.get(slug='pubchem')

    # does a ligand with this canonical name already exist?
    try:
        return Ligand.objects.get(name=ligand_name, canonical=True)
        # FIXME check inchikey
    except Ligand.DoesNotExist:
        pass  # continue

    # does a (canonical) ligand with this InChIKey already exist?
    try:
        existing_lp = LigandProperities.objects.get(inchikey=properties['inchikey'])
        self.properities = existing_lp
        self.name = ligand_name
        self.canonical = False
        self.ambigious_alias = False
        try:
            self.save()
            return self
        except IntegrityError:
            return Ligand.objects.get(name=ligand_name, canonical=False)
    except LigandProperities.DoesNotExist:
        return self.update_ligand(ligand_name, properties, ligand_type, web_resource, pubchem_id)
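
# A minimal sketch of the property extraction above, on a mocked PubChem
# PUG REST payload (PropertyTable -> Properties[0]); the compound is aspirin
# (CID 2244) and the values are illustrative of the endpoint's shape.
pubchem = {'PropertyTable': {'Properties': [{
    'CID': 2244,
    'CanonicalSMILES': 'CC(=O)OC1=CC=CC=C1C(=O)O',
    'InChIKey': 'BSYNRYMUTXBXSQ-UHFFFAOYSA-N',
    'MolecularWeight': 180.16,
    'HBondDonorCount': 1,
    'HBondAcceptorCount': 4,
}]}}
record = pubchem['PropertyTable']['Properties'][0]
properties = {'smiles': record['CanonicalSMILES'], 'inchikey': record['InChIKey']}
if 'XLogP' in record:  # optional fields may be absent from a record
    properties['logp'] = record['XLogP']
print(properties)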
def handle(self, *args, **options):
    ## URL API to map a gene name to an Ensembl ID
    cache_dir_genes = ['genenames', 'gene_lookup']
    url_gene = 'http://rest.genenames.org/fetch/symbol/$index'

    ensembl_version = 'grch37'  # anything else uses the newest build
    if ensembl_version == 'grch37':
        ## URL to look up an Ensembl ID to find transcripts
        cache_dir_transcripts = ['ensembl37', 'transcripts']
        url_ensembl = 'https://grch37.rest.ensembl.org/lookup/id/$index?expand=1;content-type=application/json'
        ## URL to look up the sequence of a transcript
        cache_dir_seq = ['ensembl37', 'seq']
        url_ensembl_seq = 'https://grch37.rest.ensembl.org/sequence/id/$index?content-type=application/json'
    else:
        ## URL to look up an Ensembl ID to find transcripts
        cache_dir_transcripts = ['ensembl', 'transcripts']
        url_ensembl = 'https://rest.ensembl.org/lookup/id/$index?expand=1;content-type=application/json'
        ## URL to look up the sequence of a transcript
        cache_dir_seq = ['ensembl', 'seq']
        url_ensembl_seq = 'https://rest.ensembl.org/sequence/id/$index?content-type=application/json'

    # Get all human GPCRs
    ps = Protein.objects.filter(sequence_type__slug='wt', species__common_name="Human",
                                family__slug__startswith='00').all().prefetch_related('genes').order_by('entry_name')

    isoforms = {}
    total_transcripts = 0
    total_proteins_with_isoforms = 0
    gene_to_ensembl = {}
    for p in ps:
        transcripts = []
        genes = list(p.genes.all().values_list('name', flat=True))
        print(">" + p.entry_name, 'genes:', genes)
        for gene in genes:
            # Use the requests module directly due to the quirky behaviour of genenames.org
            import requests
            url = 'http://rest.genenames.org/fetch/symbol/{}'.format(gene)
            cache_file_path = '{}/{}'.format('/'.join(cache_dir_genes), gene)
            # try fetching from cache
            data = cache.get(cache_file_path)
            if not data:
                headers = {'Accept': 'application/json'}
                try:
                    resp = requests.get(url=url, headers=headers)
                    data = resp.json()
                    cache.set(cache_file_path, data, 60 * 60 * 24 * 7)  # 7 days
                except:
                    print('Error converting', gene)
                    continue
            if data['response']['docs']:
                try:
                    # Get the Ensembl gene ID
                    ensembl_gene_id = data['response']['docs'][0]['ensembl_gene_id']
                    gene_to_ensembl[p.entry_name] = ensembl_gene_id
                    ensembl_transcripts = fetch_from_web_api(url_ensembl, ensembl_gene_id, cache_dir_transcripts)
                    for t in ensembl_transcripts['Transcript']:
                        display_name = t['display_name']
                        is_canonical = t['is_canonical']
                        if is_canonical:
                            # Skip canonical entries
                            continue
                        biotype = t['biotype']
                        t_id = t['id']
                        # Only interested in protein_coding transcripts
                        if biotype == 'protein_coding':
                            length = t['Translation']['length']
                            seq_id = t['Translation']['id']
                            transcript_info = OrderedDict([('display_name', display_name),
                                                           ('t_id', t_id),
                                                           ('length', length),
                                                           ('seq_id', seq_id)])
                            seq = fetch_from_web_api(url_ensembl_seq, seq_id, cache_dir_seq)
                            transcript_info['seq'] = seq['seq']
                            transcripts.append(transcript_info)
                            total_transcripts += 1
                except:
                    print('Error fetching ensembl_gene_id for gene', gene)
                    pass
        print(len(transcripts), 'transcripts found')
        # Add the protein if transcripts were found
        if len(transcripts):
            isoforms[p.entry_name] = transcripts
            total_proteins_with_isoforms += 1

    # print a small summary of the results
    print('total_proteins_searched', len(ps))
    print('total_proteins_with_isoforms', total_proteins_with_isoforms)
    print('total_transcripts', total_transcripts)
    print(gene_to_ensembl)

    # save to file
    f = open('protein/data/all_isoforms.json', 'w')
    json.dump(isoforms, f, indent=4, separators=(',', ': '))
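
# A minimal sketch of reading the genenames.org payload above, with a mocked
# response (response -> docs[0] -> ensembl_gene_id). ADRB2/ENSG00000169252 is
# a real symbol-to-gene pairing; the surrounding structure mirrors what the
# code consumes.
data = {'response': {'numFound': 1,
                     'docs': [{'symbol': 'ADRB2',
                               'ensembl_gene_id': 'ENSG00000169252'}]}}
if data['response']['docs']:
    ensembl_gene_id = data['response']['docs'][0]['ensembl_gene_id']
    print(ensembl_gene_id)  # ENSG00000169252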
def load_from_pubchem(self, lookup_type, pubchem_id, ligand_type, ligand_title=False):
    logger = logging.getLogger('build')

    # if a ligand title is specified, use that as the name
    if ligand_title:
        ligand_name = ligand_title
    # otherwise, fetch the ligand name from PubChem
    else:
        # check cache
        cache_dir = ['pubchem', 'cid', 'synonyms']
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/synonyms/json'.format(lookup_type)
        pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)

        # get the name from the response
        try:
            ligand_name = pubchem['InformationList']['Information'][0]['Synonym'][0]
        except:
            logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
            return None

    # fetch ligand properties from PubChem
    properties = {}
    # check cache
    cache_dir = ['pubchem', 'cid', 'property']
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/property/CanonicalSMILES,InChIKey/json'.format(lookup_type)
    pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)

    # get the properties from the response
    try:
        properties['smiles'] = pubchem['PropertyTable']['Properties'][0]['CanonicalSMILES']
        properties['inchikey'] = pubchem['PropertyTable']['Properties'][0]['InChIKey']
    except:
        logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
        return None

    # pubchem webresource
    web_resource = WebResource.objects.get(slug='pubchem')

    # does a ligand with this canonical name already exist?
    try:
        return Ligand.objects.get(name=ligand_name, canonical=True)
        # FIXME check inchikey
    except Ligand.DoesNotExist:
        pass  # continue

    # does a (canonical) ligand with this InChIKey already exist?
    try:
        existing_lp = LigandProperities.objects.get(inchikey=properties['inchikey'])
        self.properities = existing_lp
        self.name = ligand_name
        self.canonical = False
        self.ambigious_alias = False
        try:
            self.save()
            return self
        except IntegrityError:
            return Ligand.objects.get(name=ligand_name, canonical=False)
    except LigandProperities.DoesNotExist:
        return self.update_ligand(ligand_name, properties, ligand_type, web_resource, pubchem_id)
def AlignIsoformWildtype(request):
    p = request.GET.get("protein")
    es = request.GET.getlist("ensembl_id[]")
    iso = request.GET.get("iso_id")
    data = {}
    data['isoforms'] = {}
    protein = Protein.objects.get(entry_name__startswith=p.lower(), sequence_type__slug='wt',
                                  species__common_name='Human')
    parent_seq = protein.sequence
    rs = Residue.objects.filter(protein_conformation__protein=protein).prefetch_related(
        'protein_segment', 'display_generic_number', 'generic_number')
    data['res'] = {}
    data['same'] = "true"
    for r in rs:
        data['res'][r.sequence_number] = [r.protein_segment.slug,
                                          str(r.display_generic_number),
                                          r.sequence_number]

    from common.tools import fetch_from_web_api
    from Bio import pairwise2

    cache_dir = ['ensembl', 'isoform']
    url = 'https://rest.ensembl.org/sequence/id/$index?content-type=application/json&type=protein'
    # the GRCh37 endpoint overrides the default
    url = 'https://grch37.rest.ensembl.org/sequence/id/$index?type=protein;content-type=application/json'

    seq_filename = "protein/data/MSA_GPCR_isoforms/{}_human_isoform_MSA.fa".format(p.lower())
    with open(seq_filename, "r") as myfile:
        fasta_raw = myfile.read()
    fasta = fasta_raw.splitlines()

    data['wt2'] = fasta[1]
    data['pre_aligned'] = fasta[1 + int(iso) * 2]
    new_wt2 = ''
    new_pre_aligned = ''
    for i, wt in enumerate(data['wt2']):
        pa = data['pre_aligned'][i]
        if not (wt == '-' and pa == '-'):
            new_wt2 += wt
            new_pre_aligned += pa

    gaps = 0
    data['res_correct2'] = {}
    for i, r in enumerate(data['wt2'], 1):
        if r == "-":
            data['res_correct2'][i] = ['', '', '']
            gaps += 1
        else:
            data['res_correct2'][i] = data['res'][i - gaps]

    for e in es[:1]:
        isoform_info = fetch_from_web_api(url, e, cache_dir)
        if isoform_info:
            seq = isoform_info['seq']
            # (a ClustalOmega-based alignment was used here previously;
            # pairwise2 replaced it)
            pw2 = pairwise2.align.globalms(parent_seq, seq, 2, -5, -10, -.5)
            aln_human = pw2[0][0]
            aln_isoform = pw2[0][1]
            data['wt'] = aln_human
            data['isoforms'][e] = aln_isoform

            gaps = 0
            data['res_correct'] = {}
            for i, r in enumerate(data['wt'], 1):
                if r == "-":
                    data['res_correct'][i] = ['', '', '']
                    gaps += 1
                else:
                    data['res_correct'][i] = data['res'][i - gaps]

            if new_pre_aligned != aln_isoform:
                data['same'] = "false"
        else:
            print('error fetching info from', e)

    return JsonResponse(data)
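
# A minimal sketch of the Biopython scoring used above: globalms(seqA, seqB,
# match, mismatch, open, extend) returns alignments as (seqA_aln, seqB_aln,
# score, begin, end) tuples. pairwise2 ships with Biopython (deprecated in
# favour of Bio.Align.PairwiseAligner in recent releases). Sequences are
# illustrative.
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

pw2 = pairwise2.align.globalms('MKTAYIAKQR', 'MKTAYIRQR', 2, -5, -10, -.5)
aln_ref, aln_isoform = pw2[0][0], pw2[0][1]
print(format_alignment(*pw2[0]))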
def handle(self, *args, **options): ## Prepare comparasion info ## filepath = 'protein/data/Isoform_annotation_table.txt' lmb_data = OrderedDict() total_lmb_isoforms = 0 all_lmb_isoforms = [] with open(filepath, "r", encoding='UTF-8') as f: for i,row in enumerate(f): if i>0: c = row.split("\t") entry_name = "{}_human".format(c[1].lower()) transcripts = c[4].split(", ") if not entry_name in lmb_data: lmb_data[entry_name] = [] lmb_data[entry_name] += transcripts total_lmb_isoforms += 1 all_lmb_isoforms += transcripts print('all_lmb_isoforms',len(all_lmb_isoforms),'distinct',len(set(all_lmb_isoforms))) ## Get parsed gtex annotation with open('protein/data/matched_gtex.json') as json_file: gtex_old = json.load(json_file) ## Need to rewrite these entries, as ensembl doesnt use the . for transcripts gtex = {} for key, val in gtex_old['transcripts'].items(): t,g = key.split("_") new_key = "{}_{}".format(t.split(".")[0],g) gtex[new_key] = val # del gtex[new_key]['subjects'] ## Url API to map genename to ensemble ID cache_dir_genes = ['gtexportal', 'gene_lookup'] url_gene = 'https://gtexportal.org/rest/v1/reference/gene?geneId=$index&gencodeVersion=v19&genomeBuild=GRCh37%2Fhg19&pageSize=250&format=json' ## Url to lookup ensemble ID to find transcripts cache_dir_transcripts_gtex = ['gtexportal', 'transcripts'] url_transcripts = 'https://gtexportal.org/rest/v1/reference/transcript?gencodeId=$index&gencodeVersion=v19&genomeBuild=GRCh37%2Fhg19' cache_dir_transcripts = ['ensembl37', 'transcripts'] url_ensembl = 'https://grch37.rest.ensembl.org/lookup/id/$index?expand=1;content-type=application/json' cache_dir_gtex_expression = ['gtexportal', 'expression_data'] url_expression = 'https://gtexportal.org/rest/v1/expression/medianTranscriptExpression?datasetId=gtex_v7&gencodeId=$index&format=json' ## Url to lookup sequence of transcript cache_dir_seq = ['ensembl37', 'seq_protein'] url_ensembl_seq = 'https://grch37.rest.ensembl.org/sequence/id/$index?content-type=application/json;type=protein' # Get all human GPCRs ps = Protein.objects.filter(sequence_type__slug='wt', species__common_name="Human", family__slug__startswith='00').all().prefetch_related('genes').order_by('entry_name') isoforms = {} total_transcripts = 0 total_transcript_skipped_no_tissue=0 total_proteins_with_isoforms = 0 gene_to_ensembl = {} transcripts_ids_total = set() transcripts_ids_skipped_total = set() total_fetched_transcripts = 0 canonical_disagreement_count = 0 total_new_transcripts = [] total_not_found = [] total_not_found_due_to_skipped = [] new_proteins = set() lmb_compare_sequences = [0,0,0] # correct, wrong, not exists in lmb sequence_lookup = {} ## COMPARE SEQUENCES filenames = os.listdir("protein/data/LMB_sequences/") all_lmb_sequences= {} for f in filenames: with open ("protein/data/LMB_sequences/"+f, "r") as myfile: fasta=myfile.read().splitlines() for i,l in enumerate(fasta): if l[0]==">": e_id = l[2:] continue if e_id in all_lmb_sequences: print('already there!',e_id) if i>2: all_lmb_sequences[e_id]=l print('all_lmb_sequences',len(all_lmb_sequences)) f = open("protein/data/20190726_transcripts.fa", "w") missing_sequences = 0 total_lmb_sequences = 0 sequences_lookup = defaultdict(list) for p,ts in lmb_data.items(): seq = Protein.objects.get(entry_name=p).sequence sequences_lookup[seq].append([p,p]) # print(p,ts) # print(seq) f.write(">{} GPCRdb sequence reference\n".format(p)) f.write("{}\n".format(seq)) seq_filename = "protein/data/LMB_sequences/{}_nonstrict_transcripts.fa".format(p) lmb_sequences = {} try: with open 
(seq_filename, "r") as myfile:
                #fasta_raw = myfile.read()
                fasta = myfile.read().splitlines()
                for i, l in enumerate(fasta):
                    if l[0] == ">":
                        e_id = l[2:]
                        continue
                    lmb_sequences[e_id] = l
                    if i > 2:
                        total_lmb_sequences += 1
        except:
            #print('No file for',p,' So no sequence for',ts)
            missing_sequences += len(ts)
        for t in ts:
            if not t in lmb_sequences:
                #print('missing ',t,'in',"{}_nonstrict_transcripts.fa".format(p))
                missing_sequences += 1
            seq = fetch_from_web_api(url_ensembl_seq, t, cache_dir_seq)['seq']
            sequences_lookup[seq].append([t, p])
            if t in lmb_sequences:
                if seq != lmb_sequences[t]:
                    print(t, 'different from LMB - length ensembl:', len(seq), "length lmb:", len(lmb_sequences[t]))
            f.write(">{} ({})\n".format(t, p))
            f.write("{}\n".format(seq))
    f.close()

    print('total missing sequences', missing_sequences)
    print('total lmb transcript sequences provided', total_lmb_sequences)
    print('total lmb protein', len(lmb_data))
    #return

    for seq, ts in sequences_lookup.items():
        if len(ts) > 1:
            print('Identical sequence:', ts)

    sequences_lookup = defaultdict(list)
    all_transcript_seq = {}
    for p in ps:  # .filter(entry_name='gpc5b_human').all():
        transcripts = []
        transcripts_ids = []
        transcripts_ids_skipped = []
        ensembl_transcripts_count = 0
        genes = list(p.genes.all().values_list('name', flat=True))
        uniprot = p.accession
        canonical = ''
        canon_seq = p.sequence
        # sequence_lookup[canon_seq] = p.entry_name
        grch37_canonical_seq = ''
        uniprot_canonical = ''
        grch37_canonical = ''
        # print(">" + p.entry_name,uniprot, 'genes:',genes)
        seq_filename = "protein/data/LMB_sequences/{}_nonstrict_transcripts.fa".format(p.entry_name)
        lmb_sequences = {}
        try:
            with open(seq_filename, "r") as myfile:
                #fasta_raw = myfile.read()
                fasta = myfile.read().splitlines()
                for l in fasta:
                    if l[0] == ">":
                        e_id = l[2:]
                        continue
                    lmb_sequences[e_id] = l
        except:
            pass
            #break

        alternative_ids_uniprot = self.find_ensembl_id_by_uniprot(uniprot)
        # print(alternative_ids_uniprot)
        ensembl_gene_id = []
        for gene in genes:
            if not gene:
                continue
            gene_lookup = fetch_from_web_api(url_gene, gene, cache_dir_genes)
            # try:
            same_gene_id = ''
            if gene_lookup['gene']:
                for gene_info in gene_lookup['gene']:
                    if gene_info['geneSymbol'] == gene:
                        ensembl_gene_id.append(gene_info['gencodeId'])

        if len(ensembl_gene_id) > 1:
            print(ensembl_gene_id, 'MORE THAN 1 !!!!')
        if len(ensembl_gene_id) == 0:
            print('No ID found, using uniprot')
            if alternative_ids_uniprot['genes']:
                ensembl_gene_id = alternative_ids_uniprot['genes'][0]
            else:
                print("NO ID FOR THIS RECEPTOR")
                continue
        else:
            ensembl_gene_id = ensembl_gene_id[0]

        #alternative_id = self.find_ensembl_id(gene)
        # alternative_id_uniprot = self.find_ensembl_id_by_uniprot(uniprot)
        # print(ensembl_gene_id,alternative_ids_uniprot)
        # expression = fetch_from_web_api(url_expression,ensembl_gene_id,cache_dir_gtex_expression)
        # print(expression)
        # go through expression
        # expressed_transcripts = {}
        # for e in expression['medianTranscriptExpression']:
        #     if e['median']>0 or 1==1:
        #         #only if expression
        #         t_id = e['transcriptId']
        #         t_short = t_id.split(".")[0]
        #         tissue = e['tissueSiteDetailId']
        #         if t_short not in expressed_transcripts:
        #             expressed_transcripts[t_short] = {'long':t_id,'tissues':[], 'max_median':0}
        #         if expressed_transcripts[t_short]['max_median']<e['median']:
        #             expressed_transcripts[t_short]['max_median'] = e['median']
        #         expressed_transcripts[t_short]['tissues'].append([tissue,e['median']])
        # print(expressed_transcripts)
        # print(ensembl_gene_id)

        gene_to_ensembl[p.entry_name] = ensembl_gene_id
        # print("E_ID: " +ensembl_gene_id,alternative_ids_uniprot)
        # ensembl_transcripts = fetch_from_web_api(url_ensembl, ensembl_gene_id, cache_dir_transcripts)
        # use uniprot gene ID instead
        ensembl_transcripts = fetch_from_web_api(url_ensembl, ensembl_gene_id, cache_dir_transcripts)
        # print(ensembl_gene_id)
        if (alternative_ids_uniprot['genes'] and ensembl_gene_id.split(".")[0] != alternative_ids_uniprot['genes'][0]):
            print("##### ensembl gene id changed", ensembl_gene_id, alternative_ids_uniprot['genes'][0])
        #total_fetched_transcripts += len(ensembl_transcripts['Transcript'])
        # print(ensembl_transcripts)
        same_gene_id = True
        if not ensembl_transcripts:
            print('error', alternative_ids_uniprot, ensembl_gene_id)
            same_gene_id = False
            ensembl_transcripts = fetch_from_web_api(url_ensembl, alternative_ids_uniprot['genes'][0], cache_dir_transcripts)

        for t in ensembl_transcripts['Transcript']:
            ensembl_transcripts_count += 1
            display_name = t['display_name']
            is_canonical = t['is_canonical']
            biotype = t['biotype']
            t_id = t['id']
            # Only interested in protein_coding
            if biotype == 'protein_coding':
                total_fetched_transcripts += 1
                key = '{}_{}'.format(t_id, ensembl_gene_id)
                if not key in gtex:
                    # print('t_id', t_id, 'not in expressed_transcripts')
                    total_transcript_skipped_no_tissue += 1
                    transcripts_ids_skipped_total.add(t_id)
                    transcripts_ids_skipped.append(t_id)
                    continue
                if gtex[key]["count"] < 3:
                    total_transcript_skipped_no_tissue += 1
                    transcripts_ids_skipped_total.add(t_id)
                    transcripts_ids_skipped.append(t_id)
                    continue
                length = t['Translation']['length']
                seq_id = t['Translation']['id']
                transcript_info = OrderedDict([('display_name', display_name), ('t_id', t_id),
                                               ('length', length), ('seq_id', seq_id),
                                               ('expressed', gtex[key])])
                seq = fetch_from_web_api(url_ensembl_seq, seq_id, cache_dir_seq)
                if is_canonical:
                    grch37_canonical = t_id
                    transcript_info['grch37_canonical'] = True
                    grch37_canonical_seq = seq['seq']
                if seq['seq'] == canon_seq:
                    uniprot_canonical = t_id
                    transcript_info['uniprot_canonical'] = True
                    continue  # Skip canonical entries
                sequences_lookup[seq['seq']].append([t_id, p.entry_name])
                all_transcript_seq[t_id] = seq['seq']
                if seq['seq'] in sequence_lookup:
                    print('SEQUENCE ALREADY SEEN', t_id, sequence_lookup[seq['seq']])
                    continue
                sequence_lookup[seq['seq']] = t_id
                transcript_info['seq'] = seq['seq']
                if not t_id in lmb_sequences:
                    transcript_info['lmb_sequences'] = False
                    lmb_compare_sequences[2] += 1
                else:
                    if lmb_sequences[t_id] == seq['seq']:
                        transcript_info['lmb_sequences'] = True
                        lmb_compare_sequences[0] += 1
                    else:
                        transcript_info['lmb_sequences'] = lmb_sequences[t_id]
                        lmb_compare_sequences[1] += 1
                if t_id in alternative_ids_uniprot['transcripts']:
                    transcript_info['in_uniprot'] = True
                else:
                    transcript_info['in_uniprot'] = False
                if p.entry_name in lmb_data and t_id in lmb_data[p.entry_name]:
                    transcript_info['in_lmb'] = True
                else:
                    transcript_info['in_lmb'] = False
                if t_id not in transcripts_ids:
                    transcripts.append(transcript_info)
                    transcripts_ids.append(t_id)
                    transcripts_ids_total.add(t_id)
                    total_transcripts += 1
        # except:
        #     print('Error fetching ensembl_gene_id for gene',gene)
        #     pass

        not_found = []
        not_found_due_to_skipped = []
        if p.entry_name in lmb_data:
            for t in lmb_data[p.entry_name]:
                if t not in transcripts_ids:
                    if t in transcripts_ids_skipped:
                        f = open("protein/data/20190726_skipped_due_to_gtex.txt", "a")
                        not_found_due_to_skipped.append(t)
                        key = '{}_{}'.format(t, ensembl_gene_id)
                        if not key in gtex:
                            reason = 'Not in GTEX'
                        else:
                            reason = 'Subjects low in GTEX - count is {} - subject ids {}'.format(gtex[key]['count'], ", ".join(gtex[key]['subjects']))
                        f.write("{}: {}\n".format(t, reason))
                        f.close()
                        # print(t)
                    else:
                        not_found.append(t)
        total_not_found += not_found
        total_not_found_due_to_skipped += not_found_due_to_skipped

        new = []
        for t in transcripts_ids:
            if p.entry_name in lmb_data and t in lmb_data[p.entry_name]:
                pass
            else:
                ts_check = sequences_lookup[all_transcript_seq[t]]
                for t_check in ts_check:
                    if p.entry_name in lmb_data and t_check in lmb_data[p.entry_name]:
                        print('found via duplicate', t_check, t)
                        continue
                key = '{}_{}'.format(t, ensembl_gene_id)
                #blast = BlastSearch(top_results=2)
                blast = BlastSearch(blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb']), top_results=2)
                blast_out = blast.run(all_transcript_seq[t])
                result = [(Protein.objects.get(pk=x[0]).entry_name, x[1].hsps[0].expect) for x in blast_out]
                #print(result)
                if result:
                    if result[0][0] == p.entry_name and result[0][1] < 0.05:
                        f = open("protein/data/20190726_new_transcripts_for_consideration.txt", "a")
                        reason = 'GTEX count: {}'.format(gtex[key]['count'])
                        f.write(">{} ({}): {}\n".format(t, p.entry_name, reason))
                        f.write("{}\n".format(all_transcript_seq[t]))
                        f.close()
                        new.append(t)
                        if p.entry_name in lmb_data:
                            new_proteins.add(p.entry_name)
                    else:
                        print('bad blast match', result)
                else:
                    print('bad blast match', result)
        total_new_transcripts += new

        # print(len(alternative_ids_uniprot['transcripts']), 'uniprot transcripts found', ensembl_transcripts_count, ' ensembl transcripts found', len(transcripts), 'transcripts kept after filtering')

        # Add the entry whether or not transcripts were found
        isoforms[p.entry_name] = {'ensembl_gene_id': ensembl_gene_id, 'same_gene_id': same_gene_id,
                                  'canonical_seq': canon_seq, 'grch37_canonical_seq': grch37_canonical_seq,
                                  'isoforms': transcripts, 'uniprot_lookup': alternative_ids_uniprot,
                                  'lmb_not_found': not_found,
                                  'lmb_not_found_due_to_skipped': not_found_due_to_skipped,
                                  'new_transcripts_than_lmb': new,
                                  'skipped_due_to_gtex': transcripts_ids_skipped,
                                  'grch37_canonical': grch37_canonical,
                                  'uniprot_canonical': uniprot_canonical}
        if len(transcripts):
            if grch37_canonical_seq != canon_seq:
                isoforms[p.entry_name]['canonical_disagreement'] = True
                canonical_disagreement_count += 1
            # isoforms[p.entry_name].append(alternative_ids_uniprot)
            # isoforms[p.entry_name].append(not_found)
            total_proteins_with_isoforms += 1
        # break

        f = open('protein/data/all_isoforms_gtex.json', 'w')
        json.dump(isoforms, f, indent=4, separators=(',', ': '))
        #break

    for seq, ts in sequences_lookup.items():
        if len(ts) > 1:
            print('identical sequence', ts)

    for t in total_not_found:
        ts_check = sequences_lookup[all_transcript_seq[t]]
        found = False
        for t_check in ts_check:
            if t_check[0] not in total_not_found:
                print(t, 'found but under another id', t_check[0])
                found = True
        if not found:
            print('##', t, 'in LMB but not in this search')

    # print small summary results
    print('total_proteins_searched', len(ps))
    print('total_proteins_with_isoforms', total_proteins_with_isoforms)
    print('Total transcripts deemed to be isoforms', total_transcripts)
    print('Amount of these not in LMB data', len(total_new_transcripts))
    print(new_proteins)
    # print('Amount in LMB not found', len(total_not_found))
    # print(total_not_found)
    print('Amount in LMB found but skipped due to GTEX data', len(total_not_found_due_to_skipped))
    print(total_not_found_due_to_skipped)
    print('Sequence compare to LMB', lmb_compare_sequences)
    print('canonical_disagreement_count', canonical_disagreement_count)
    print(total_not_found)
    # print('total_transcript_skipped_no_tissue', total_transcript_skipped_no_tissue)
    # print('total_transcript_skipped_no_tissue2 ', len(transcripts_ids_skipped_total))
    # print('total_fetched_transcripts', total_fetched_transcripts)
    # print(gene_to_ensembl)

    # save to file
    f = open('protein/data/all_isoforms_gtex.json', 'w')
    json.dump(isoforms, f, indent=4, separators=(',', ': '))
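# Example (added, hedged): a minimal sketch of consuming the JSON file written
# above. The keys ('isoforms', 'ensembl_gene_id', 'canonical_disagreement')
# mirror the per-protein dict built in this function; the helper name is ours.
import json

def summarize_isoforms(path='protein/data/all_isoforms_gtex.json'):
    with open(path) as fh:
        data = json.load(fh)
    for entry_name, info in data.items():
        if info['isoforms']:
            print(entry_name, info['ensembl_gene_id'],
                  'isoforms kept:', len(info['isoforms']),
                  'canonical disagreement:', info.get('canonical_disagreement', False))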
def update_from_doi(self, doi):
    logger = logging.getLogger('build')

    # should entrez be tried as a backup?
    try_entrez_on_fail = False

    # check whether this data is cached
    cache_dir = ['crossref', 'doi']
    url = 'http://api.crossref.org/works/$index'
    pub = fetch_from_web_api(url, doi, cache_dir)

    if pub:
        # update record
        try:
            self.title = pub['message']['title'][0]
            try:
                self.year = pub['message']['created']['date-parts'][0][0]
            except:
                self.year = pub['message']['deposited']['date-parts'][0][0]

            # go from [{'family': 'Gloriam', 'given': 'David E.'}] to ['Gloriam DE']
            authors = ['{} {}'.format(x['family'], ''.join([y[:1] for y in x['given'].split()]))
                       for x in pub['message']['author']]
            self.authors = ', '.join(authors)

            # get volume and pages if available
            reference = {}
            fields = ['volume', 'page']
            for f in fields:
                if f in pub['message']:
                    reference[f] = pub['message'][f]
                else:
                    reference[f] = 'X'
            self.reference = '{}:{}'.format(reference['volume'], reference['page'])

            # journal
            journal = pub['message']['container-title'][0]
            try:
                # not all records have the journal abbreviation
                journal_abbr = pub['message']['container-title'][1]
            except:
                journal_abbr = slugify(journal)
            try:
                self.journal, created = PublicationJournal.objects.get_or_create(
                    name=journal, defaults={'slug': journal_abbr})
                if created:
                    logger.info('Created journal {}'.format(journal))
            except IntegrityError:
                self.journal = PublicationJournal.objects.get(name=journal)
        except Exception as msg:
            logger.warning('Processing data from CrossRef for {} failed: {}'.format(doi, msg))
            try_entrez_on_fail = False
    else:
        print("Publication not on crossref", doi)
        try_entrez_on_fail = False

    if try_entrez_on_fail:
        # try searching entrez for DOI
        try:
            Entrez.email = '*****@*****.**'
            record = Entrez.read(Entrez.esearch(
                db='pubmed',
                retmax=1,
                term=doi
            ))
            self.update_from_pubmed_data(record['IdList'][0])
        except:
            return False
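# Example (added, hedged): the CrossRef author reshaping used in update_from_doi,
# pulled out for illustration; the helper name is ours. It turns
# [{'family': 'Gloriam', 'given': 'David E.'}] into 'Gloriam DE' by initialling
# every token of the given name.
def format_crossref_authors(author_list):
    return ', '.join('{} {}'.format(a['family'],
                                    ''.join(g[:1] for g in a['given'].split()))
                     for a in author_list)

# format_crossref_authors([{'family': 'Gloriam', 'given': 'David E.'}]) == 'Gloriam DE'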
def main_func(self, positions, iteration, count, lock):
    # filenames
    # if not positions[1]:
    #     rows = self.data[positions[0]:]
    # else:
    #     rows = self.data[positions[0]:positions[1]]

    missing_proteins = {}
    mutants_for_proteins = {}
    wrong_uniport_ids = {}

    c = 0
    skipped = 0
    inserted = 0
    bulk_m = []
    bulk_r = []
    current_sheet = time.time()

    rows = self.data_all
    while count.value < len(rows):
        with lock:
            r = rows[count.value]
            count.value += 1
        # print(r['source_file'],c)  # PRINT IF ERRORS OCCUR
        #self.logger.info('File '+str(r['source_file'])+' number '+str(c))
        current = time.time()
        c += 1
        # if c%100==0:
        #     self.logger.info('Parsed '+str(c)+' mutant data entries')

        # publication
        try:
            # fix if it thinks it's a float
            float(r['reference'])
            r['reference'] = str(int(r['reference']))
            float(r['review'])
            r['review'] = str(int(r['review']))
        except ValueError:
            pass

        if r['reference'].isdigit():  # assume pubmed
            pub_type = 'pubmed'
        else:  # assume doi
            pub_type = 'doi'

        if r['reference'] not in self.publication_cache:
            try:
                wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
            except WebLink.DoesNotExist:
                try:
                    wl = WebLink.objects.create(index=r['reference'],
                                                web_resource=WebResource.objects.get(slug=pub_type))
                except IntegrityError:
                    wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
            try:
                pub = Publication.objects.get(web_link=wl)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = wl
                    pub.save()
                except IntegrityError:
                    pub = Publication.objects.get(web_link=wl)

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except:
                    self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                    continue  # if something is off with the publication, skip.
            self.publication_cache[r['reference']] = pub
        else:
            pub = self.publication_cache[r['reference']]

        # print(r['review'],r['reference'])
        if r['review'].isdigit():  # assume pubmed
            pub_type = 'pubmed'
        elif r['review'].startswith('http'):
            pub_type = 'raw_link'
        else:  # assume doi
            pub_type = 'doi'

        # print(r['review'],pub_type)
        if r['review']:
            if r['review'] not in self.publication_cache:
                try:
                    wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    try:
                        wl = WebLink.objects.create(index=r['review'],
                                                    web_resource=WebResource.objects.get(slug=pub_type))
                    except IntegrityError:
                        wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type)
                try:
                    pub_review = Publication.objects.get(web_link=wl)
                except Publication.DoesNotExist:
                    pub_review = Publication()
                    try:
                        pub_review.web_link = wl
                        pub_review.save()
                    except IntegrityError:
                        pub_review = Publication.objects.get(web_link=wl)

                    if pub_type == 'doi':
                        pub_review.update_from_doi(doi=r['review'])
                    elif pub_type == 'pubmed':
                        pub_review.update_from_pubmed_data(index=r['review'])
                    try:
                        pub_review.save()
                    except:
                        self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type)
                        continue  # if something is off with the publication, skip.
                self.publication_cache[r['review']] = pub_review
            else:
                pub_review = self.publication_cache[r['review']]
        else:
            pub_review = None

        l = None
        if str(r['ligand_name']) in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']]
        else:
            self.ligand_cache[str(r['ligand_name'])] = {}

        if not l:
            try:
                l = get_or_make_ligand(r['ligand_id'], r['ligand_type'], str(r['ligand_name']))
            except Exception as msg:
                print('Something errored with ligand, aborting entry of mutation',
                      r['ligand_name'], r['ligand_type'], r['ligand_id'], r['source_file'])
                print(msg)
                traceback.print_exc()
                continue
            self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

        l_ref = None
        if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
            l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
        else:
            if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists():
                # this name is canonical and already has a ligand record
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False,
                                       ambigious_alias=False).exists():
                # this matches an alias that has exactly one parent canonical name - i.e. distinct
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False,
                                           ambigious_alias=False)
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False,
                                       ambigious_alias=True).exists():
                # this matches an alias with several canonical parents; must investigate, start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists():
                # ambigious_alias not specified
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                l_ref.ambigious_alias = False
                l_ref.save()
            elif r['exp_mu_ligand_ref']:
                # neither a canonical nor an alias exists, so create the records.
                # Remember to check for canonical / alias status.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                try:
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                except IntegrityError:
                    if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists():
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                    else:
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                    # print("error failing ligand, duplicate?")
                try:
                    l_ref.save()
                except IntegrityError:
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                    # print("error failing ligand, duplicate?")
                    # logger.error("FAILED SAVING LIGAND, duplicate?")
            else:
                l_ref = None
            self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

        protein_id = 0
        residue_id = 0

        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
            if r['protein'] in mutants_for_proteins:
                mutants_for_proteins[r['protein']] += 1
            else:
                mutants_for_proteins[r['protein']] = 1
        elif r['protein'] not in missing_proteins:
            try:
                # read the corrected name from the lookup table before
                # overwriting the key it is indexed by
                real_uniprot = wrong_uniport_ids[r['protein']]
                r['protein'] = real_uniprot
                protein = Protein.objects.get(entry_name=r['protein'])
                # print('fetched with lookup table',r['protein'])
            except:
                # look for it as uniprot
                protein = Protein.objects.filter(web_links__web_resource__slug='uniprot',
                                                 web_links__index=r['protein'].upper())
                if protein.exists():
                    protein = protein.get()
                    real_uniprot = protein.entry_name
                    if r['protein'] in mutants_for_proteins:
                        mutants_for_proteins[r['protein']] += 1
                    else:
                        mutants_for_proteins[r['protein']] = 1
                else:
                    # Try to look it up in uniprot to catch typing errors / variants in entry_name
                    url = 'http://www.uniprot.org/uniprot/$index.xml'
                    cache_dir = ['uniprot', 'id']
                    uniprot_protein = fetch_from_web_api(url, r['protein'], cache_dir, xml=True)
                    try:
                        real_uniprot = uniprot_protein.find('.//{http://uniprot.org/uniprot}name').text.lower()
                        protein = Protein.objects.get(entry_name=real_uniprot)
                    except:
                        skipped += 1
                        if r['protein'] in missing_proteins:
                            missing_proteins[r['protein']] += 1
                        else:
                            missing_proteins[r['protein']] = 1
                        # print('Skipped due to no protein '+ r['protein'])
                        self.logger.error('Skipped due to no protein ' + r['protein'])
                        continue
                wrong_uniport_ids[r['protein']] = protein.entry_name
                r['protein'] = real_uniprot
        else:
            missing_proteins[r['protein']] += 1
            continue

        res = Residue.objects.filter(protein_conformation__protein=protein,
                                     amino_acid=r['mutation_from'],
                                     sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] +
                              ' pos:' + str(r['mutation_pos']) + ' AA:' + r['mutation_from'])
            # print('Skipped due to no residue or mismatch AA ...', r['source_file'])
            skipped += 1
            continue

        if r['ligand_class']:
            try:
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]})  # FIXME this should not be needed
            except Exception as e:
                if LigandRole.objects.filter(slug=slugify(r['ligand_class'])[:50]).exists():
                    l_role = LigandRole.objects.get(slug=slugify(r['ligand_class'])[:50])
                    if l_role.name == slugify(r['ligand_class'])[:50]:
                        # if the role name equals the slug it was created by the constructs script; replace it
                        l_role.name = r['ligand_class']
                        l_role.save()
                else:
                    print(e)
                    print("Error with", r['ligand_class'], slugify(r['ligand_class'])[:50])
                    l_role, created = LigandRole.objects.get_or_create(
                        slug=slugify(r['ligand_class'])[:50])  # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, created = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        # if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
        #     exp_opt_id, created = MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
        # else:
        #     exp_opt_id = None

        try:
            mutation, created = Mutation.objects.get_or_create(amino_acid=r['mutation_to'],
                                                               protein=protein, residue=res)
        except IntegrityError:
            mutation = Mutation.objects.get(amino_acid=r['mutation_to'], protein=protein, residue=res)

        logtypes = ['pEC50', 'pIC50', 'pK']
        foldchange = 0
        typefold = ''
        if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:  # fix for new format
            if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']):
                # -log values!
                try:
                    foldchange = round(math.pow(10, -r['exp_mu_value_raw']) / pow(10, -r['exp_wt_value']), 3)
                except:
                    print(r)
                typefold = r['exp_type'] + "_log"
            elif "%" == r['exp_wt_unit']:
                # if % then it's a difference case, where a lower value is bad.
                # Otherwise it's a concentration and lower is better.
                foldchange = round(r['exp_wt_value'] / r['exp_mu_value_raw'], 3)
            else:
                foldchange = round(r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                typefold = r['exp_type'] + "_not_log"
            if 0 < foldchange < 1:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            if foldchange < 1:
                foldchange = -round((1 / foldchange), 3)
        r['fold_effect'] = foldchange

        raw_experiment = self.insert_raw(r)
        # raw_experiment.save()
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            submitting_group=r['submitting_group'],
            data_container=r['data_container'],
            data_container_number=r['data_container_number'],
            protein=protein,
            residue=res,
            ligand=l,
            ligand_role=l_role,
            ligand_ref=l_ref,
            # raw=raw_experiment,  # raw_experiment, OR None
            # optional=exp_opt_id,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'],
            # wt_unit=r['exp_wt_unit'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange,
            opt_receptor_expression=r['opt_receptor_expression'],
            opt_basal_activity=r['opt_basal_activity'],
            opt_gain_of_activity=r['opt_gain_of_activity'],
            opt_ligand_emax=r['opt_ligand_emax'],
            opt_agonist=r['opt_agonist'],
        )
        # for line,val in r.items():
        #     val = str(val)
        #     if len(val)>100:
        #         print(line,"too long",val)
        # mut_id = obj.id
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)
        # try:
        #     bulk.save()
        # except Exception as e:
        #     print(e)
        #     print(r)
        #     break
        #print('saved ',r['source_file'])
        inserted += 1
        end = time.time()
        diff = round(end - current, 2)
        #print(diff)
        self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' + str(skipped))

    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    for i, me in enumerate(bulk_m):
        me.raw = raws[i]
    MutationExperiment.objects.bulk_create(bulk_m)

    end = time.time()
    diff = round(end - current, 2)
    # current_sheet
    diff_2 = round(end - current_sheet, 2)
    print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped", str(skipped))
    sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1), reverse=True)
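# Example (added, hedged): the fold-change convention from main_func as a
# standalone helper, with the same rounding and sign flip; the function name
# and defaults are ours. Values between 0 and 1 become negative reciprocals,
# so a 10-fold loss of potency reads as 10.0 and a 10-fold gain as -10.0.
import math
import re

LOGTYPES = ['pEC50', 'pIC50', 'pK']

def fold_change(wt_value, mu_value, exp_type='', wt_unit=''):
    if re.match("(" + ")|(".join(LOGTYPES) + ")", exp_type):  # -log scale values
        fold = round(math.pow(10, -mu_value) / math.pow(10, -wt_value), 3)
    elif wt_unit == '%':  # percentages: a lower mutant value is a loss
        fold = round(wt_value / mu_value, 3)
    else:  # concentrations: lower is better
        fold = round(mu_value / wt_value, 3)
    if 0 < fold < 1:
        fold = -round(1 / fold, 3)
    return fold

# fold_change(8.0, 7.0, 'pEC50') -> 10.0   (mutant 10x less potent)
# fold_change(8.0, 9.0, 'pEC50') -> -10.0  (mutant 10x more potent)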
def load_from_pubchem(self, lookup_type, pubchem_id, ligand_type, ligand_title=False):
    logger = logging.getLogger('build')

    # if ligand title is specified, use that as the name
    if ligand_title:
        ligand_name = ligand_title
    # otherwise, fetch ligand name from pubchem
    else:
        # check cache
        cache_dir = ['pubchem', 'cid', 'synonyms']
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/synonyms/json'.format(lookup_type)
        pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)

        # get name from response
        try:
            ligand_name = pubchem['InformationList']['Information'][0]['Synonym'][0]
        except:
            logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
            return None

    # fetch ligand properties from pubchem
    properties = {}

    # check cache
    cache_dir = ['pubchem', 'cid', 'property']
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/property/CanonicalSMILES,InChIKey/json'.format(lookup_type)
    pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)

    # get properties from response
    try:
        properties['smiles'] = pubchem['PropertyTable']['Properties'][0]['CanonicalSMILES']
        properties['inchikey'] = pubchem['PropertyTable']['Properties'][0]['InChIKey']
    except:
        logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
        return None

    # pubchem webresource
    web_resource = WebResource.objects.get(slug='pubchem')

    # does a ligand with this canonical name already exist?
    try:
        return Ligand.objects.get(name=ligand_name, canonical=True)
        # FIXME check inchikey
    except Ligand.DoesNotExist:
        pass  # continue

    # does a (canonical) ligand with this inchikey already exist?
    try:
        existing_lp = LigandProperities.objects.get(inchikey=properties['inchikey'])
        self.properities = existing_lp
        self.name = ligand_name
        self.canonical = False
        self.ambigious_alias = False
        try:
            self.save()
            return self
        except IntegrityError:
            return Ligand.objects.get(name=ligand_name, canonical=False)
    except LigandProperities.DoesNotExist:
        return self.update_ligand(ligand_name, properties, ligand_type, web_resource, pubchem_id)
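# Example (added, hedged): what the cached fetch above resolves to for a
# synonyms lookup, sketched with plain urllib and no caching. '$index' in the
# URL template stands for the PubChem identifier; 'cid' is one valid
# lookup_type. The helper name is ours.
import json
from urllib.request import urlopen

def pubchem_first_synonym(pubchem_id, lookup_type='cid'):
    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/{}/'
           'synonyms/json').format(lookup_type, pubchem_id)
    with urlopen(url) as response:
        data = json.loads(response.read().decode('utf-8'))
    return data['InformationList']['Information'][0]['Synonym'][0]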
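# Example (added, hedged): the two-step bulk insert at the end of main_func,
# reduced to its skeleton. Linking via the objects returned by bulk_create
# assumes a database backend that sets primary keys on bulk_create (e.g.
# PostgreSQL); the helper name is ours, the models are the ones used above.
def bulk_insert_experiments(raw_rows, experiment_rows):
    # insert the raw payloads first, keeping the returned objects in order
    raws = MutationRaw.objects.bulk_create(raw_rows)
    # point each experiment at its raw record, then insert the experiments
    for raw, experiment in zip(raws, experiment_rows):
        experiment.raw = raw
    MutationExperiment.objects.bulk_create(experiment_rows)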