def quick_getprotinfo(protlist):
    """Run a UniProt quick search through the bioservices client.

    A new, non-verbose ``UniProt`` client is created on every call and
    ``protlist`` is handed unchanged to its ``quick_search`` method.

    :param protlist: query forwarded as-is to ``UniProt.quick_search``
        (described by the original author as a list of proteins).
    :return: whatever ``quick_search`` returns. The original notes describe
        it as a dict of protein information (Entry name, Gene names, Length,
        Organism, Protein names, Status) -- not verifiable from this block.
    """
    client = UniProt(verbose=False)
    return client.quick_search(protlist)
def search_struc():
    """Annotate ``myprot_list.csv`` with UniProt entry names and structures.

    Reads ``myprot_list.csv`` (``;``-delimited) from ``resultspath`` and
    writes ``myprot_list_struc.csv`` next to it.  Every input row is copied
    and extended with two columns:

    * ``Uniprot entry`` -- lower-cased 'Entry name' returned by a UniProt
      quick search for the accession found in the fourth column;
    * ``Struc`` -- the result of ``obtain_struc_pdb`` for that entry, or, when
      that is empty, ``"[Model]: "`` followed by the result of
      ``obtain_struc_pdb`` for the template reported by the GPCRdb
      ``structure/template`` service.

    The header row (first cell equal to ``"Family"``) only receives the two
    new column titles.  Blank CSV lines are skipped.  Progress and the reason
    for every missing value are printed to stdout.

    :return: None
    """
    in_path = os.path.join(resultspath, "myprot_list.csv")
    out_path = os.path.join(resultspath, 'myprot_list_struc.csv')
    u = UniProt()

    # The csv module expects its files to be opened with newline="";
    # otherwise embedded newlines are misread and "\r\r\n" line endings are
    # produced on platforms that translate "\n".
    with open(in_path, "r", newline="") as infile, \
            open(out_path, "w", newline="") as outfile:
        myreader = csv.reader(infile, delimiter=';')
        mywriter = csv.writer(outfile, delimiter=';')

        for row in myreader:
            if not row:
                # csv.reader yields [] for blank lines; row[0] would raise.
                continue
            if row[0] == "Family":
                mywriter.writerow(row + ["Uniprot entry", "Struc"])
                continue

            struccell = ""
            entry = ""
            uprot = row[3]
            print("\n\n", row[2])

            if not uprot:
                print("-----No uprot ID")
                mywriter.writerow(row + [entry, struccell])
                continue

            data = u.quick_search("id:%s" % uprot)
            if not data:
                print("-----Uprot ID not found")
                mywriter.writerow(row + [entry, struccell])
                continue

            entry = data[uprot]['Entry name'].lower()
            struc_res = obtain_struc_pdb(entry)
            if struc_res:
                struccell = struc_res
                print(struccell)
            else:
                # No experimental structure: fall back to the template that
                # GPCRdb suggests for this receptor.
                temp = requests.get(
                    'http://gpcrdb.org/services/structure/template/'
                    + entry).json()
                if not temp:
                    print("-----No template")
                else:
                    temp_res = obtain_struc_pdb(temp)
                    if temp_res:
                        struccell = "[Model]: " + temp_res
                        print(struccell)
                    else:
                        print("-----No struc for template")

            mywriter.writerow(row + [entry, struccell])
def obtained(liste, output_file):
    """Look up E. coli K12 UniProt entries and write a GO report.

    For every identifier in ``liste`` a UniProt quick search is run with the
    query ``"<id> Escherichia coli (strain K12)"`` (best hit only) and the
    returned accession(s) are collected.  The collected accessions are then
    fetched as a data frame and its ``"Gene ontology (GO)"`` column is
    written, together with both identifier lists, to ``output_file``.

    Report layout (the French headings are runtime text and are kept as is):
    the input identifiers space-separated on one line, the UniProt
    accessions space-separated on one line, then every GO item of every
    accession, each followed by ``"\\n \\n"``.

    NOTE(review): the original source reached us with its indentation
    stripped, so the exact placement of the newline writes relative to the
    loops is reconstructed -- confirm against a reference output file.

    :param liste: iterable of identifiers; each one is converted with
        ``str`` before use.
    :param output_file: path of the text report to (over)write.
    :return: the ``"Gene ontology (GO)"`` column of the data frame.
    """
    organism = "Escherichia coli (strain K12)"
    u = UniProt()

    uprot = []
    for ids in liste:
        hits = u.quick_search(str(ids) + " " + organism, limit=1)
        uprot.extend(hits.keys())

    dd = u.get_df(uprot)
    GO = dd["Gene ontology (GO)"]

    # The context manager guarantees the report is flushed and closed, even
    # if one of the writes raises (the file used to be left open).
    with open(output_file, 'w') as report:
        report.write("numero uniprot est le suivant \n \n")
        for item in liste:
            # str() is applied before concatenating so non-string
            # identifiers are accepted, as they already are for the query.
            report.write(str(item) + " ")
        report.write("\n")
        report.write("\n")

        report.write("ID accession uniprot est le suivant \n")
        for accession in uprot:
            report.write(str(accession) + " ")
        report.write("\n")
        report.write("\n")

        report.write(
            "L'enrichissement ontologique pour ces termes est la suivante \n\n")
        # GO is indexed by label, exactly like the original GO[i] access.
        for i in range(len(GO)):
            for term in GO[i]:
                report.write(term)
                report.write("\n \n")

    return GO
def handle(self, *args, **options):
    """Build the JSON caches of GPCR variants and mutants.

    For every published ``DyndbDynamics`` object the UniProt accession(s) of
    its protein(s) are collected, resolved to a UniProt entry name, and,
    depending on ``options["sel_mv"]``:

    * ``"var_only"`` -- ExAC variants are gathered into ``gpcr_vars.json``;
    * ``"mut_only"`` -- GPCRdb mutants are gathered into ``gpcr_muts.json``;
    * ``"all"`` -- both are gathered and both files are written.

    Both files live in ``/protwis/sites/files/Precomputed/muts_vars_info``.
    When ``options["update"]`` is true the caches are rebuilt from scratch;
    otherwise the existing files are loaded and only entries that are not
    yet present are fetched.

    :param args: unused positional command arguments.
    :param options: command options; the keys ``update`` and ``sel_mv`` are
        read.
    :return: None
    """

    def open_json(json_filepath):
        """Return the parsed content of the JSON file at ``json_filepath``."""
        # Read with the same stdlib module that writes these files below.
        with open(json_filepath) as json_file:
            return json.load(json_file)

    def open_dict_or_new(filepath):
        """Return the JSON dict stored at ``filepath``, or ``{}`` if absent."""
        try:
            return open_json(filepath)
        except FileNotFoundError:
            return {}

    def uniprot_mapping(fromtype, totype, identifier):
        """Map ``identifier`` between id types with the UniProt mapping service.

        Abbreviations of the UniProt identifier types are listed at
        https://www.uniprot.org/help/api_idmapping

        :return: the raw (bytes) tab-delimited response body.
        """
        base = 'http://www.uniprot.org'
        tool = 'mapping'
        params = {
            'from': fromtype,
            'to': totype,
            'format': 'tab',
            'query': identifier,
        }
        # urlencode turns the params dict into an encoded URL suffix.
        data = urllib.parse.urlencode(params)
        url = base + '/' + tool + '?' + data
        # The context manager closes the HTTP response once it is read.
        with urllib.request.urlopen(url) as response:
            return response.read()

    def calc_foldchange(wt_val, mut_val):
        """Return the signed fold change between wild type and mutant.

        ``mut/wt`` when the mutant value is the larger one, ``-(wt/mut)``
        otherwise; rounded to 3 decimals and returned as a string.
        """
        if mut_val >= wt_val:
            return str(round(mut_val / wt_val, 3))
        return str(-round(wt_val / mut_val, 3))

    def obtain_mut_info(muts_dict, uprot, entry):
        """Add the GPCRdb mutants of ``entry`` to ``muts_dict``.

        Entries already present are left untouched (and no request is made
        for them).  ``uprot`` is accepted for symmetry with
        ``obtain_var_info`` and is not used.

        :return: ``muts_dict`` (also modified in place), shaped as
            ``{entry: {sequence position: [mutation dict, ...]}}``.
        """
        if entry in muts_dict:
            return muts_dict
        mut_info = requests.get(
            'http://gpcrdb.org/services/mutants/' + entry).json()
        entry_muts = muts_dict[entry] = {}
        for pos_mut in mut_info or []:
            # Convert once so the zero test and the division agree on type.
            wt_val = float(pos_mut["exp_wt_value"])
            mut_val = float(pos_mut["exp_mu_effect_value"])
            if wt_val == 0 or mut_val == 0:
                # A fold change is undefined when either value is zero.
                fchange = False
            else:
                fchange = calc_foldchange(wt_val, mut_val)
            pos_mut_d = {
                "from": pos_mut["mutation_from"],
                "to": pos_mut["mutation_to"],
                "fchange": fchange,
                "measure": pos_mut["exp_type"],
                "unit": pos_mut["exp_wt_unit"],
                "exp": pos_mut["exp_func"],
                "lig": pos_mut["ligand_name"],
                "pub_ref": pos_mut["reference"],
                "qual": pos_mut["exp_mu_effect_qual"],
            }
            entry_muts.setdefault(pos_mut["mutation_pos"], []).append(pos_mut_d)
        return muts_dict

    # HGVSp examples handled below:
    #   frameshift_variant: p.Val152SerfsTer3
    #   stop_gained:        p.Trp32Ter
    #   missense_variant:   p.Leu340Pro
    #   synonymous_variant: p.Thr384Thr
    hgvsp_re = re.compile(r"p\.([A-Za-z]*)(\d*)(([A-Za-z]*).*)")
    frameshift_re = re.compile(r"^([A-Za-z]*)(fsTer.*)$")
    handled_consequences = (
        "frameshift_variant", "missense_variant", "stop_gained",
        "synonymous_variant",
    )

    def obtain_var_info(vars_dict, uprot, entry):
        """Add the ExAC variants of the gene of ``uprot`` to ``vars_dict``.

        The accession is first mapped to an Ensembl gene id.  Entries already
        present in ``vars_dict`` are left untouched so that a protein shared
        by several dynamics is neither fetched nor stored twice.  Variants
        whose ``HGVSp`` is missing or cannot be parsed are skipped.

        :return: ``vars_dict`` (also modified in place), shaped as
            ``{entry: {sequence number: [variant dict, ...]}}``.
        """
        if entry in vars_dict:
            return vars_dict
        uprot_map = uniprot_mapping('ACC+ID', 'ENSEMBL_ID', uprot)
        # presumably a response this short holds only the "From\tTo" header,
        # i.e. no mapping was found -- TODO confirm
        if len(uprot_map) <= 8:
            return vars_dict
        ens_id = uprot_map.decode().strip("\n").split("\t")[-1]
        exac = requests.get(
            "http://exac.hms.harvard.edu/rest/gene/variants_in_gene/"
            + ens_id).json()
        entry_vars = vars_dict[entry] = {}
        for exac_var in exac:
            consequence = exac_var["major_consequence"]
            if consequence not in handled_consequences:
                continue
            var_info = exac_var["HGVSp"]
            mymatch = hgvsp_re.match(var_info) if var_info else None
            if mymatch is None:
                continue
            fromAA, seqNum, toAA = mymatch.group(1, 2, 3)
            from_sAA = aa_short[fromAA.upper()]
            if consequence == "stop_gained":
                to_sAA = toAA
            elif consequence == "frameshift_variant":
                mymatch2 = frameshift_re.match(toAA)
                if mymatch2 is None:
                    continue
                to_sAA = aa_short[mymatch2.group(1).upper()] + mymatch2.group(2)
            else:  # missense_variant or synonymous_variant
                to_sAA = aa_short[toAA.upper()]
            pos_vars = {
                "from": from_sAA,
                "to": to_sAA,
                "consequence": consequence,
                "exac_var_id": exac_var["variant_id"],
                "allele_freq": exac_var["allele_freq"],
            }
            entry_vars.setdefault(seqNum, []).append(pos_vars)
        return vars_dict

    mypath = "/protwis/sites/files/Precomputed/muts_vars_info"
    os.makedirs(mypath, exist_ok=True)
    vars_filepath = os.path.join(mypath, "gpcr_vars.json")
    muts_filepath = os.path.join(mypath, "gpcr_muts.json")
    if options['update']:
        vars_dict = {}
        muts_dict = {}
    else:
        vars_dict = open_dict_or_new(vars_filepath)
        muts_dict = open_dict_or_new(muts_filepath)

    # "all" must enable BOTH branches.  The previous if/elif chain made the
    # mutant branch unreachable for "all", so mutants were never collected
    # nor written in that mode.
    want_vars = options["sel_mv"] in ("var_only", "all")
    want_muts = options["sel_mv"] in ("mut_only", "all")

    dyn_li = DyndbDynamics.objects.filter(is_published=True)
    dynobj = dyn_li.annotate(dyn_id=F('id'))
    dynobj = dynobj.annotate(
        uniprot=F('id_model__id_protein__uniprotkbac'))
    dynobj = dynobj.annotate(uniprot2=F(
        'id_model__id_complex_molecule__id_complex_exp__dyndbcomplexprotein__id_protein__uniprotkbac'
    ))
    dynprotdata = dynobj.values("dyn_id", "uniprot", "uniprot2")

    # dyn id -> list of UniProt accessions of its protein(s)
    dyn_dict = {}
    for dyn in dynprotdata:
        dyn_id = dyn["dyn_id"]
        up = dyn["uniprot"] or dyn["uniprot2"]
        if not up:
            self.stdout.write(
                self.style.NOTICE("UniProt ID not found for dyn %s" %
                                  (dyn_id)))
            continue
        dyn_dict.setdefault(dyn_id, []).append(up)

    u = UniProt()
    for dyn_id, uprot_li in dyn_dict.items():
        for uprot in uprot_li:
            data = u.quick_search("id:%s" % uprot)
            if not data:
                self.stdout.write(
                    self.style.NOTICE(
                        "No uniprot id found for %s (dyn id:%i)." %
                        (uprot, dyn_id)))
                continue
            entry = data[uprot]['Entry name'].lower()
            if want_vars:
                vars_dict = obtain_var_info(vars_dict, uprot, entry)
            if want_muts:
                muts_dict = obtain_mut_info(muts_dict, uprot, entry)

    if want_vars:
        with open(vars_filepath, 'w') as outfile:
            json.dump(vars_dict, outfile)
    if want_muts:
        with open(muts_filepath, 'w') as outfile:
            json.dump(muts_dict, outfile)
protlist: list of proteins idnm: type of protein names, such as AC. output:: dict of protein information:: Entry name; Gene names; Length; Organism; Protein names; Status. """ u = UniProt(verbose=False) return u.quick_search(protlist) # if "__name__" == "__main__": u = UniProt(verbose=False) data = u.search("zap70+taxonomy:9606", frmt="tab", limit=3, columns="entry name, length, id, genes") print data res = u.search("DNMT1_HUMAN", frmt="tab", columns="entry name, protein names, pathway, comments") print(res) res = u.quick_search("DNMT1_HUMAN") getreskey = res.keys() res[getreskey]['Gene names'] df = u.get_df("GALK1_HUMAN") df['Length'].hist() plt.show()
class GOTermAdder(object):
    """Append a column of GO terms to the rows of a tab-separated file.

    For every row the gene id is extracted from a ``GeneID:<id>;`` tag in
    the configured column, resolved to a UniProt id, and the GO ids
    referenced by that UniProt entry are appended to the row as
    ``"<GO id> (<GO term name>), ..."``.

    Every remote answer (gene id -> UniProt id, UniProt id -> GO ids,
    GO id -> name) is cached as ``<key>.json`` inside ``tmp_data`` so that
    repeated runs do not query the services again.
    """

    def __init__(self, input_file, gene_id_column, output_file):
        """Store the configuration and create the cache folder if missing.

        :param input_file: path of the tab-separated input file.
        :param gene_id_column: 1-based index of the column holding the
            ``GeneID:`` tag.
        :param output_file: path of the tab-separated file to write.
        """
        self._input_file = input_file
        self._gene_id_column = gene_id_column
        self._output_file = output_file
        self._tmp_folder = "tmp_data"
        self._uniprot = UniProt(verbose=False)
        self._quickgo = QuickGO(verbose=False)
        if not os.path.exists(self._tmp_folder):
            os.mkdir(self._tmp_folder)

    def add_go_terms(self):
        """Copy the input file to the output file, adding the GO column.

        Rows that are blank or whose first cell is empty are copied
        unchanged.
        """
        # Both handles are managed so neither is leaked; the input is opened
        # first so a missing input no longer truncates an existing output.
        with open(self._input_file) as input_fh, \
                open(self._output_file, "w") as output_fh:
            for row in csv.reader(input_fh, delimiter="\t"):
                if row and row[0]:
                    row = self._add_go_term_column(row)
                self._write_row(row, output_fh)

    def _write_row(self, row, output_fh):
        """Write ``row`` to ``output_fh`` as one tab-separated line."""
        output_fh.write("\t".join(row) + "\n")

    def _add_go_term_column(self, row):
        """Return ``row`` extended with its GO terms.

        ``row`` is returned unchanged when no unique UniProt id is known for
        its gene.
        """
        gene_id = self._gene_id(row)
        uniprot_id = self._uniprot_id(gene_id)
        if uniprot_id is None:
            return row
        go_terms = self._go_terms(uniprot_id)
        row.append(", ".join(
            "%s (%s)" % (go_term, self._go_term_name(go_term))
            for go_term in go_terms))
        return row

    def _uniprot_id(self, gene_id):
        """Return the UniProt id for ``gene_id`` (``None`` if not unique).

        The answer is read from the cache when available; otherwise it is
        searched, cached and returned.
        """
        file_path = self._tmp_file_path(gene_id)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["Uniprot"]
        uniprot_id = self._search_uniprot_id(gene_id)
        with open(file_path, "w") as json_fh:
            json.dump({"Uniprot": uniprot_id}, json_fh)
        # This return was missing: a cache miss used to yield None, so the
        # first lookup of every gene never produced GO terms.
        return uniprot_id

    def _go_terms(self, uniprot_id):
        """Return the list of GO ids referenced by the UniProt entry."""
        file_path = self._tmp_file_path(uniprot_id)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["GO-Terms"]
        uniprot_entry = self._uniprot.searchUniProtId(uniprot_id)
        go_ids = [dbref.attrs["id"]
                  for dbref in uniprot_entry.findAll("dbreference")
                  if dbref.attrs["type"] == "GO"]
        with open(file_path, "w") as json_fh:
            json.dump({"GO-Terms": go_ids}, json_fh)
        return go_ids

    def _go_term_name(self, go_term):
        """Return the name of ``go_term`` as reported by QuickGO."""
        file_path = self._tmp_file_path(go_term)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["name"]
        go_term_info = self._quickgo.Term(go_term).soup
        go_term_name = go_term_info.term.find("name").text
        with open(file_path, "w") as json_fh:
            json.dump({"name": go_term_name}, json_fh)
        return go_term_name

    def _search_uniprot_id(self, gene_id):
        """Return the single UniProt id matching ``gene_id``.

        ``None`` is returned when the quick search yields no hit or more
        than one hit, because an ambiguous match cannot be resolved here.
        """
        hits = self._uniprot.quick_search(gene_id)
        if len(hits) == 1:
            # next(iter(...)) works on Python 2 and 3; keys()[0] does not.
            return next(iter(hits))
        return None

    def _tmp_file_path(self, gene_id):
        """Return the cache file path used for the key ``gene_id``."""
        return "%s/%s.json" % (self._tmp_folder, gene_id)

    def _gene_id(self, row):
        """Extract the id following ``GeneID:`` (up to the next ``;``)."""
        return row[self._gene_id_column-1].split("GeneID:")[1].split(";")[0]
class uniprot_queries():
    '''
    Interrogation of the uniprot database.

    Every public lookup returns a 4-tuple
    ``(uniprot ID, entry name, gene names, protein names)``; four empty
    strings mean "nothing found".  Results (including failures) are
    memoised in ``self.dicotested`` so the same key is never searched twice.
    '''

    def __init__(self, specie, taxid):
        '''
        specie: organism name appended to the queries ('' for no filter)
        taxid: taxonomy id appended after the organism ('' to leave it out)
        '''
        self.query = ""
        self.specie = specie
        self.taxid = taxid
        self.dicotested = {}  # avoid testing the same ID several times
        self.unip = UniProt()

    def _remember(self, key, record):
        ''' Cache the 4-field record under key and return it as a tuple '''
        self.dicotested[key] = list(record)
        return tuple(record)

    def _recall(self, key):
        ''' Return the cached 4-field record of key as a tuple '''
        cached = self.dicotested[key]
        return cached[0], cached[1], cached[2], cached[3]

    @staticmethod
    def _record(hits, unip_id):
        ''' Build the 4-field record of unip_id from a quick_search result '''
        info = hits[unip_id]
        return [unip_id, info['Entry name'], info['Gene names'],
                info['Protein names']]

    def create_query(self, query):
        '''
        Create the query that will be used to search the database.

        '+' is escaped as '\\+', the characters '"', ':' and ';' are removed,
        and the organism (plus taxid when set) filter is appended.  The
        cleaned query is also stored in self.query.
        '''
        cleaned = query.replace('+', '\\+')
        for char in ('"', ':', ';'):
            cleaned = cleaned.replace(char, '')
        self.query = cleaned
        # change cq in function of the specie specifications
        if self.specie == '':
            return cleaned
        cq = cleaned + ' AND organism: ' + self.specie
        if self.taxid != '':
            cq = cq + ' [' + self.taxid + ']'
        return cq

    def uniprot_request(self, cq):
        '''
        Request to the uniprot database using the uniprot module from
        bioservices.  Only the best match is requested; the request is tried
        up to 10 times and {} is returned when every attempt fails.
        '''
        cq = re.sub(r'^\[', '', cq)  # remove the [ when it starts with it
        d = {}
        for _ in range(10):
            try:
                d = self.unip.quick_search(cq, limit=1)
                break
            except (zeep.exceptions.Fault, zeep.exceptions.TransportError,
                    TypeError, AttributeError):
                # transient failure of the service: try again
                d = {}
        return d

    def query_id(self, query):
        '''
        Use bioservices to search the uniprot ID corresponding to a query.

        The best hit is accepted only if the (cleaned) query equals one of
        its gene names -- also tried with everything after the first '_'
        dropped -- or one of its protein names, or if a protein name is
        found inside the query.
        RETURN: uniprotID, uniprot entry name, uniprot gene names,
        uniprot protein names ('' for each when nothing matches)
        '''
        cq = self.create_query(query)
        if cq in self.dicotested:  # cq already tested
            return self._recall(cq)

        d = self.uniprot_request(cq)
        if d:  # at least one result
            unipID = next(iter(d))
            for gene in d[unipID]['Gene names'].split(' '):
                if clean(query) == clean(gene):
                    return self._remember(cq, self._record(d, unipID))
                if len(query.split('_')) > 1:
                    query = query.split('_')[0]
                    if clean(query) == clean(gene):
                        return self._remember(cq, self._record(d, unipID))
            for protein in d[unipID]['Protein names'].split('('):
                protein = protein.strip()
                protein = re.sub(r"\)$", '', protein)
                protein = protein.replace('+', '')
                if clean(query) == clean(protein):
                    return self._remember(cq, self._record(d, unipID))
                # By Cecile:
                # NOTE(review): the protein name is used as a regular
                # expression here; whether clean() strips regex
                # metacharacters is not visible from this class.
                if re.search(clean(protein), clean(query)):
                    return self._remember(cq, self._record(d, unipID))
        return self._remember(cq, ['', '', '', ''])

    def mapping_id(self, prot_id, database):
        '''
        Map a protein id to obtain the information from the uniprot database.

        The id is first mapped with mapping_id2 (the bioservices mapping
        stopped working), then the mapped value is quick-searched.
        RETURN: uniprotID, entry name, gene names, protein names
        ('' for each when the id cannot be mapped or found)
        '''
        if prot_id in self.dicotested:
            return self._recall(prot_id)

        mapping = self.mapping_id2(prot_id, database)
        if len(mapping) <= 1:
            return self._remember(prot_id, ['', '', '', ''])
        value = mapping[0].split('\t')[1]
        r = self.unip.quick_search(value, limit=1)
        if r:
            unipID = next(iter(r))
            return self._remember(prot_id, self._record(r, unipID))
        # The failure is cached under prot_id, the key this method looks up
        # (it used to be stored under an unrelated query string).
        return self._remember(prot_id, ['', '', '', ''])

    def mapping_id2(self, prot_id, database):
        '''
        Map prot_id from database to a UniProt 'ID' through the uniprot
        uploadlists service.

        RETURN: the lines of the tab-delimited answer without the header
        line, or '' when the 10 connection attempts all failed.
        '''
        try:  # Python 2
            from urllib import urlencode
            from urllib2 import Request, urlopen, HTTPError
            from httplib import BadStatusLine
        except ImportError:  # Python 3
            from urllib.parse import urlencode
            from urllib.request import Request, urlopen
            from urllib.error import HTTPError
            from http.client import BadStatusLine

        url = 'https://www.uniprot.org/uploadlists/'
        params = {
            'from': database,
            'to': 'ID',
            'format': 'tab',
            'query': prot_id,
        }
        data = urlencode(params)
        if not isinstance(data, bytes):
            data = data.encode('ascii')  # Python 3 requires a bytes body
        request = Request(url, data)
        # Please set a contact email address here to help us debug in case
        # of problems (see https://www.uniprot.org/help/privacy).
        contact = ""
        request.add_header('User-Agent', 'Python %s' % contact)
        page = ''
        for _ in range(10):  # deal with problems to connect to the server
            try:
                response = urlopen(request)
                page = response.read(200000)
                if not isinstance(page, str):
                    page = page.decode('utf-8')
                page = page.split('\n')[1:]
                break
            except (HTTPError, BadStatusLine):
                page = ''
        return page

    def uni_search(self, protein):
        '''
        Do the quick search and accept the best hit as it is, without
        checking that it matches the query.
        RETURN: uniprotID, entry name, gene names, protein names
        ('' for each when the search returns nothing)
        '''
        if protein in self.dicotested:
            return self._recall(protein)
        cq = self.create_query(protein)
        r = self.unip.quick_search(cq, limit=1)
        if not r:
            return self._remember(protein, ['', '', '', ''])
        unipID = next(iter(r))
        return self._remember(protein, self._record(r, unipID))