def main():
    """
    This function undoes gene-to-protein merges. A query searches for Wikidata items
    which carry both an Entrez Gene ID (P351) and a UniProt ID (P352) on one item.
    Based on that, it generates instances of MergeDefender and undoes the merges.
    :return: None
    """
    print(sys.argv[1])
    # pwd = input('Password:')
    # log in with the bot account; the bot password is passed as the first CLI argument
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    conflict_set_1 = {'P351'}
    conflict_set_2 = {'P352'}

    likely_merged_ids = PBB_Core.WDItemList(wdquery='CLAIM[351] AND CLAIM[352]')
    print(likely_merged_ids.wditems['items'])

    for count, x in enumerate(likely_merged_ids.wditems['items']):
        print('\n', count)
        print('Q{}'.format(x))
        try:
            MergeDefender(login, merge_target='Q{}'.format(x),
                          conflict_set_1=conflict_set_1, conflict_set_2=conflict_set_2)
        except Exception as e:
            traceback.print_exc()
            PBB_Core.WDItemEngine.log('ERROR', '{main_data_id}, "{exception_type}", "{message}"'.format(
                main_data_id=x,
                exception_type=type(e),
                message=e.__str__(),
            ))
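# Illustrative sketch (not part of the original script): the same "items carrying both P351 and P352"
# check expressed against the Wikidata SPARQL endpoint instead of the legacy WDQ CLAIM[...] syntax.
# The property numbers P351 (Entrez Gene ID) and P352 (UniProt ID) come from the docstring above;
# everything else here is an assumption about how one might reproduce the query today.
import requests


def find_gene_protein_merge_candidates():
    query = """
    SELECT ?item WHERE {
      ?item wdt:P351 [] .
      ?item wdt:P352 [] .
    }
    """
    r = requests.get("https://query.wikidata.org/sparql",
                     params={"query": query, "format": "json"})
    bindings = r.json()["results"]["bindings"]
    # return plain QIDs, e.g. ['Q14905321', ...]
    return [b["item"]["value"].rsplit("/", 1)[-1] for b in bindings]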
def __init__(self):
    self.start = time.time()
    self.content = ET.fromstring(self.download_disease_ontology())
    self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                        PBB_settings.getWikiDataPassword())
    # self.updateDiseaseOntologyVersion()

    # Get all Wikidata items that carry a Disease Ontology ID (P699)
    print("Getting all terms with a Disease Ontology ID in WikiData")
    doWikiData_id = dict()
    DoInWikiData = PBB_Core.WDItemList("CLAIM[699]", "699")

    print("Getting latest version of Disease Ontology from Github")
    r = requests.get("https://api.github.com/repos/DiseaseOntology/HumanDiseaseOntology/git/refs")
    test = r.json()
    sha = test[0]["object"]["sha"]
    githubReferenceUrl = "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/" + sha + "/src/ontology/doid.owl"

    for diseaseItem in DoInWikiData.wditems["props"]["699"]:
        # diseaseItem[2] = DO identifier, diseaseItem[0] = WD item number
        doWikiData_id[str(diseaseItem[2])] = diseaseItem[0]

    for doClass in self.content.findall('.//owl:Class', DiseaseOntology_settings.getDoNameSpaces()):
        try:
            disVars = []
            disVars.append(doClass)
            disVars.append(githubReferenceUrl)
            disVars.append(doWikiData_id)
            disVars.append(self.logincreds)
            disVars.append(self.start)

            diseaseClass = disease(disVars)

            print("do_id: " + diseaseClass.do_id)
            print(diseaseClass.wdid)
            print(diseaseClass.name)
            print(diseaseClass.synonyms)
            print(diseaseClass.xrefs)
        except Exception as e:
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=diseaseClass.do_id,
                    exception_type=type(e),
                    message=e.__str__(),
                    wd_id='-',
                    duration=time.time() - self.start))
            f = open('/tmp/Diseaseexceptions.txt', 'a')
            # f.write("Unexpected error:", sys.exc_info()[0]+'\n')
            f.write(diseaseClass.do_id + "\n")
            # f.write(diseaseClass.wd_json_representation)
            traceback.print_exc(file=f)
            f.close()
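# Illustrative helper (not part of the original bots): several of these __init__ methods repeat the
# same pattern of turning WDQ results into an {external_id: WD_item_number} dict, where tuple index 2
# holds the external identifier and index 0 the numeric Wikidata item ID. A minimal sketch, assuming
# the PBB_Core.WDItemList result layout used above; the helper name is hypothetical.
def wdq_props_to_dict(wd_item_list, prop_nr):
    """Map external identifiers to numeric Wikidata item IDs for one property."""
    mapping = dict()
    for entry in wd_item_list.wditems["props"][prop_nr]:
        # entry[2] = external identifier, entry[0] = numeric Wikidata item ID
        mapping[str(entry[2])] = entry[0]
    return mapping

# Example usage (hypothetical): doWikiData_id = wdq_props_to_dict(DoInWikiData, "699")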
def main():
    pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=pwd)

    # for mouse genes: replace the generic French description 'gène' with 'gène de souris' ("mouse gene")
    # LabelReplacement(PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:83310]').wditems['items'],
    #                  {'gène': 'gène de souris'}, 'fr', login)

    # for human genes: replace the generic French description 'gène' with 'gène humain' ("human gene")
    LabelReplacement(PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:5]').wditems['items'],
                     {'gène': 'gène humain'}, 'fr', login)
def __init__(self):
    self.start = time.time()
    self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                        PBB_settings.getWikiDataPassword())

    # Get all Wikidata items that carry a Gene Ontology ID (P686)
    print("Getting all terms with a Gene Ontology ID in WikiData")
    goWikiData_id = dict()
    goInWikiData = PBB_Core.WDItemList("CLAIM[686]", "686")
    for goItem in goInWikiData.wditems["props"]["686"]:
        # goItem[2] = GO identifier, goItem[0] = numeric Wikidata item ID
        goWikiData_id[str(goItem[2])] = goItem[0]
    print(len(goWikiData_id.keys()))
    sys.exit()  # debugging stop: nothing below this line runs while this call is in place

    graph = rdflib.Graph()
    goUrl = requests.get("http://purl.obolibrary.org/obo/go.owl")
    print("Parsing the Gene Ontology OWL file...")
    graph.parse(data=goUrl.text, format="application/rdf+xml")

    cls = URIRef("http://www.w3.org/2002/07/owl#Class")
    subcls = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")

    counter = 0
    for gouri in graph.subjects(RDF.type, cls):
        try:
            counter = counter + 1
            print(counter)
            goVars = dict()
            goVars["uri"] = gouri
            goVars["label"] = graph.label(URIRef(gouri))
            goVars["wikidata_id"] = goWikiData_id
            goVars["logincreds"] = self.logincreds
            goVars["start"] = self.start
            goVars["graph"] = graph
            if "GO" in gouri:
                goClass = goTerm(goVars)
        except Exception as e:
            print(traceback.format_exc())
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=gouri,
                    exception_type=type(e),
                    message=e.__str__(),
                    wd_id='-',
                    duration=time.time() - self.start))
def __init__(self):
    self.content = json.loads(self.download_mouse_proteins())
    # print(self.content["results"]["bindings"])
    self.protein_count = len(self.content["results"]["bindings"])
    self.proteins = self.content["results"]["bindings"]
    self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                        PBB_settings.getWikiDataPassword())

    uniprotWikidataIds = dict()
    print("Getting all proteins with a uniprot ID in Wikidata")
    InWikiData = PBB_Core.WDItemList("CLAIM[703:83310] AND CLAIM[352]", "352")

    r0 = requests.get(
        "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+taxonomy%3a+%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aSELECT+DISTINCT+*%0d%0aWHERE%0d%0a%7b%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++%09%09%3fprotein+rdfs%3alabel+%3fprotein_label+.%0d%0a++++++++%3fprotein+up%3aorganism+taxonomy%3a10090+.%0d%0a%7d&format=srj"
    )

    for proteinItem in InWikiData.wditems["props"]["352"]:
        try:
            uniprotWikidataIds[str(proteinItem[2])] = proteinItem[0]
            r = requests.get(
                "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
                + str(proteinItem[2]) +
                "%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.+%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fplabel+%3fecName+%3fupversion&format=srj"
            )
            print(r.text)
            protein = json.loads(r.text)
            protein["logincreds"] = self.logincreds
            protein["wdid"] = 'Q' + str(proteinItem[0])
            print(protein)

            proteinClass = mouse_protein(protein)
        except Exception:
            # client = Client('http://*****:*****@sentry.sulab.org/9')
            # client.captureException()
            print("There has been an except")
            print("Unexpected error:", sys.exc_info()[0])
            f = open('/tmp/exceptions.txt', 'a')
            # f.write("Unexpected error:", sys.exc_info()[0]+'\n')
            f.write(str(protein["results"]["bindings"][0]["uniprot"]["value"]) + "\n")
            traceback.print_exc(file=f)
            f.close()
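# Illustrative sketch (not part of the original bot): the hand-encoded UniProt SPARQL URLs above can be
# built more readably by letting requests do the URL encoding. The endpoint and the 'query'/'format=srj'
# parameters are the ones already used above; the plain-text query shown here only paraphrases the
# organism filter of the r0 request and is an assumption, not the bot's own code.
import requests


def query_uniprot_sparql(sparql_query):
    r = requests.get("http://sparql.uniprot.org/sparql",
                     params={"query": sparql_query, "format": "srj"})
    return r.json()


# Example usage (mouse proteins, taxon 10090, as in the r0 request above):
mouse_query = """
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX taxonomy: <http://purl.uniprot.org/taxonomy/>
SELECT DISTINCT * WHERE {
  ?protein a up:Protein .
  ?protein rdfs:label ?protein_label .
  ?protein up:organism taxonomy:10090 .
}
"""
# result = query_uniprot_sparql(mouse_query)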
def __init__(self):
    self.start = time.time()
    self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                        PBB_settings.getWikiDataPassword())

    # Get all Wikidata items that carry an Uberon ID (P1554)
    print("Getting all terms with a Uberon ID in WikiData")
    ubWikiData_id = dict()
    ubInWikiData = PBB_Core.WDItemList("CLAIM[1554]", "1554")
    for uberonItem in ubInWikiData.wditems["props"]["1554"]:
        # uberonItem[2] = Uberon identifier, uberonItem[0] = numeric Wikidata item ID
        ubWikiData_id[str(uberonItem[2])] = uberonItem[0]

    graph = rdflib.Graph()
    ubUrl = requests.get("http://purl.obolibrary.org/obo/uberon.owl")
    print("Parsing the Uberon OWL file...")
    graph.parse(data=ubUrl.text, format="application/rdf+xml")

    cls = URIRef("http://www.w3.org/2002/07/owl#Class")
    subcls = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")

    for uberonuri in graph.subjects(RDF.type, cls):
        try:
            uberonVars = dict()
            uberonVars["uberon"] = uberonuri
            uberonVars["uberonLabel"] = graph.label(URIRef(uberonuri))
            uberonVars["wikidata_id"] = ubWikiData_id
            uberonVars["logincreds"] = self.logincreds
            uberonVars["start"] = self.start
            uberonVars["graph"] = graph
            if "UBERON" in uberonuri:
                uberonClass = uberonTerm(uberonVars)
        except Exception as e:
            print(traceback.format_exc())
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=uberonuri,
                    exception_type=type(e),
                    message=e.__str__(),
                    wd_id='-',
                    duration=time.time() - self.start))
def __init__(self, login):
    self.login_obj = login

    image_data = pd.read_csv('./image_data/gene_wiki_images_with_preferred.txt',
                             encoding='utf-8', sep='\t', dtype={'entrez': np.str})

    wdq_results = PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:5]', '351').wditems
    wd_entrez_ids = list(map(lambda z: z[2], wdq_results['props']['351']))
    entrez_qid_list = list(map(lambda z: 'Q{}'.format(z[0]), wdq_results['props']['351']))
    print(len(wd_entrez_ids))

    for index in image_data.index:
        start = time.time()

        # print(image_data.loc[index, 'other_images'])
        image_names = image_data.loc[index, 'other_images']
        preferred_image = image_data.loc[index, 'primary_image']

        image_file_extension = ['.png', '.jpg', '.jpeg', '.pdf']
        if pd.notnull(preferred_image) and '|' in preferred_image:
            for splt in preferred_image.split('|'):
                for ending in image_file_extension:
                    if ending in splt:
                        preferred_image = splt
                        break

        entrez = image_data.loc[index, 'entrez']
        # print(entrez)

        protein_images = []
        protein_image_value_store = []
        genex_images = []
        genex_value_store = []

        if entrez not in wd_entrez_ids:
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=entrez,
                    exception_type='',
                    message='Entrez ID not yet in Wikidata!!',
                    wd_id='',
                    duration=time.time() - start))
            continue
        else:
            curr_qid = entrez_qid_list[wd_entrez_ids.index(entrez)]

        if pd.isnull(image_names):
            PBB_Core.WDItemEngine.log(
                'WARNING', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=entrez,
                    exception_type='',
                    message='No images available for this Entrez ID',
                    wd_id=curr_qid,
                    duration=time.time() - start))
            continue

        for sub_string in image_names.split('|'):
            if 'PBB GE ' in sub_string:
                value = sub_string[5:]
                # if value[-6:-4] == 'tn':
                #     value = value[:-6] + 'fs' + value[-4:]

                # Gene Expression reference: https://www.wikidata.org/wiki/Q21074956
                genex_images.append(value)
                genex_value_store.append(PBB_Core.WDCommonsMedia(value=value, prop_nr='P692'))
            elif 'PDB ' in sub_string:
                value = sub_string[5:]
                protein_images.append(value)
                protein_image_value_store.append(PBB_Core.WDCommonsMedia(value, prop_nr=''))

        entrez_id_value = PBB_Core.WDString(value=entrez, prop_nr='P351')
        data = [entrez_id_value]
        data.extend(genex_value_store)

        if pd.notnull(preferred_image):
            data.append(PBB_Core.WDCommonsMedia(value=preferred_image, prop_nr='P18'))

        try:
            gene_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid, domain='genes', data=data)
            # pprint.pprint(gene_item.get_wd_json_representation())
            gene_item.write(self.login_obj)

            PBB_Core.WDItemEngine.log(
                'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=entrez,
                    exception_type='',
                    message='success',
                    wd_id=curr_qid,
                    duration=time.time() - start))
            print(index, 'success', curr_qid, entrez, gene_item.get_label(lang='en'))
        except Exception as e:
            print(index, 'error', curr_qid, entrez)
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=entrez,
                    exception_type=type(e),
                    message=e.__str__(),
                    wd_id=curr_qid,
                    duration=time.time() - start))
params = {
    'action': 'wbsearchentities',
    'format': 'json',
    'language': 'en',
    'type': 'item',
    'search': searchTerm
}
data = requests.get(url, params=params)
reply = json.loads(data.text)

if len(reply['search']) == 0:
    sys.exit("A new version of DO has been released, a full update is required")
else:
    doVersionID = reply['search'][0]['id']

doWikiData_id = dict()
DoInWikiData = PBB_Core.WDItemList("CLAIM[699]", "699")

for doClass in content.findall('.//owl:Class', DiseaseOntology_settings.getDoNameSpaces()):
    try:
        do_id = doClass.findall('.//oboInOwl:id', DiseaseOntology_settings.getDoNameSpaces())[0].text
        if do_id == sys.argv[1]:
            disVars = []
            disVars.append(doClass)
            disVars.append(doVersionID)
            disVars.append(doWikiData_id)
            disVars.append(logincreds)
            disVars.append(start)

            diseaseClass = DiseaseOntology.disease(disVars)
speciesInfo["rat"]["taxid"] = "10114"
speciesInfo["rat"]["wdid"] = "Q36396"
speciesInfo["rat"]["name"] = "rat"
speciesInfo["rat"]["release"] = "Q19296606"

if len(sys.argv) == 1:
    print("Please provide an NCBI gene ID")
    print("Example: python singleGeneBot.py 628")
    sys.exit()

logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())

entrezWikidataIds = dict()
wdqQuery = "CLAIM[351]"
InWikiData = PBB_Core.WDItemList(wdqQuery, wdprop="351")
'''
Below, a mapping is created between Entrez gene IDs and Wikidata item IDs.
'''
for geneItem in InWikiData.wditems["props"]["351"]:
    entrezWikidataIds[str(geneItem[2])] = geneItem[0]

uniprotwikidataids = dict()
print('Getting all proteins with a uniprot ID in Wikidata...')
inwikidata = PBB_Core.WDItemList("CLAIM[352]", "352")
for proteinItem in inwikidata.wditems["props"]["352"]:
    uniprotwikidataids[str(proteinItem[2])] = proteinItem[0]

try:
    object = dict()
    object["entrezgene"] = str(sys.argv[1])
while line != "":
    alreadyAdded.append(line.strip())
    line = added.readline()

f1 = open('alreadyAdded.txt', 'a+')

mygeneinfo_url = "http://mygene.info/v2/query?q=_exists_:wikipedia&fields=wikipedia,entrezgene&size=15000"
r = requests.get(mygeneinfo_url)
mappings = r.json()
PBB_Debug.prettyPrint(mappings)

# Get entrezgene - Wikidata mapping
entrezWikidataIds = dict()
wdqQuery = "CLAIM[351]"
InWikiData = PBB_Core.WDItemList(wdqQuery, wdprop="351")
logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())
for geneItem in InWikiData.wditems["props"]["351"]:
    entrezWikidataIds[int(geneItem[2])] = geneItem[0]

for hit in mappings["hits"]:
    print(hit["entrezgene"])
    f1.write(str(hit["entrezgene"]) + "\n")
    data2add = []
    try:
        if (hit["entrezgene"] in entrezWikidataIds.keys()
                and hit["wikipedia"]["url_stub"].count("?") == 0
                and str(hit["entrezgene"]) not in alreadyAdded):
            print(entrezWikidataIds[hit["entrezgene"]])
]

data_types = {x: object for x in col_names}
data_types.update({'has interwiki link': bool})

append = True
if os.path.isfile('./WD_to_WP_disease_map.csv') and append:
    wd_to_wp_map = pd.read_csv('./WD_to_WP_disease_map.csv', index_col=0, dtype=data_types)
else:
    wd_to_wp_map = pd.DataFrame(columns=col_names)

wd_disease_items = PBB_Core.WDItemList(
    'CLAIM[279:12136] or CLAIM[279:929833] or CLAIM[31:12136] '
    'or CLAIM[31:929833] or CLAIM[557] or CLAIM[699] or claim[493] '
    'or claim[494] or claim[1995]').wditems['items']

print(wd_to_wp_map.dtypes)
print('Total number of items to match:', len(wd_disease_items))

for count, item in enumerate(wd_disease_items):
    print(item)
    if str(item) in wd_to_wp_map['QID'].values and append:
        print('skipping', item)
        count += 1
        continue

    wd_object = PBB_Core.WDItemEngine(wd_item_id='Q{}'.format(item))
    wd_json = wd_object.wd_json_representation
def __init__(self):
    self.start = time.time()
    self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(),
                                        PBB_settings.getWikiDataPassword())

    uniprotwikidataids = dict()
    genesymbolwdmapping = dict()

    print('Getting all proteins with a uniprot ID in Wikidata...')
    inwikidata = PBB_Core.WDItemList("CLAIM[703:5] AND CLAIM[352]", "352")
    for proteinItem in inwikidata.wditems["props"]["352"]:
        uniprotwikidataids[str(proteinItem[2])] = proteinItem[0]

    print('Getting all human genes with an NCBI gene ID in Wikidata...')
    entrezWikidataIds = dict()
    print("wdq 1")
    wdqQuery = "CLAIM[703:5] AND CLAIM[351]"
    InWikiData = PBB_Core.WDItemList(wdqQuery, wdprop="351")
    '''
    Below, a mapping is created between Entrez gene IDs and Wikidata item IDs.
    '''
    for geneItem in InWikiData.wditems["props"]["351"]:
        entrezWikidataIds[str(geneItem[2])] = geneItem[0]

    print("Getting all human proteins from Uniprot...")
    r0 = requests.get(
        'http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+taxonomy%3a+%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+xsd%3a+%3chttp%3a%2f%2fwww.w3.org%2f2001%2fXMLSchema%23%3e%0d%0aSELECT+DISTINCT+*%0d%0aWHERE%0d%0a%7b%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++++++++%3fprotein+up%3areviewed+%22true%22%5e%5exsd%3aboolean+.%0d%0a++%09%09%3fprotein+rdfs%3alabel+%3fprotein_label+.%0d%0a++++++++%3fprotein+up%3aorganism+taxonomy%3a9606+.%0d%0a%7d&format=srj'
    )
    prot_results = r0.json()

    uniprot_ids = []
    for protein in prot_results["results"]["bindings"]:
        item = dict()
        item["id"] = protein["protein"]["value"].replace("http://purl.uniprot.org/uniprot/", "")
        item["label"] = protein["protein_label"]["value"]
        uniprot_ids.append(item)

    for up in uniprot_ids:
        try:
            # if up["id"] not in uniprotwikidataids:
            '''
            Get protein annotations from Uniprot
            '''
            r = requests.get(
                "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion+%0d%0a+++++++(group_concat(distinct+%3fencodedBy%3b+separator%3d%22%3b+%22)+as+%3fencoded_by)%0d%0a+++++++(group_concat(distinct+%3fncbiGene%3b+separator%3d%22%3b+%22)+as+%3fgene_id)%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
                + str(up["id"]) +
                "%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.+%0d%0a++++++++%3funiprot+up%3aencodedBy+%3fgene+.%0d%0a%09++++%3fgene+skos%3aprefLabel+%3fencodedBy+.%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++optional%7b%3funiprot+rdfs%3aseeAlso+%3fncbiGene+.%0d%0a++++++++%3fncbiGene+up%3adatabase+database%3aGeneID+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fencodedBy+%3fplabel+%3fecName+%3fupversion&format=srj"
            )
            protein = r.json()
            if len(protein["results"]["bindings"]) == 0:
                raise Exception("Communication error on " + up["id"])
            # if "results" not in protein.keys():

            '''
            Get go annotations from Uniprot
            '''
            r2 = requests.get(
                "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e+%0d%0aSELECT+DISTINCT+%3fprotein+%3fgo+%3fgoLabel+%3fparentLabel%0d%0aWHERE%0d%0a%7b%0d%0a++%09%09VALUES+%3fprotein+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
                + str(up["id"]) +
                "%3e%7d%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++%09%09%3fprotein+up%3aclassifiedWith+%3fgo+.+++%0d%0a++++++++%3fgo+rdfs%3alabel+%3fgoLabel+.%0d%0a++++++++%3fgo+rdfs%3asubClassOf*+%3fparent+.%0d%0a++++++++%3fparent+rdfs%3alabel+%3fparentLabel+.%0d%0a++++++++optional+%7b%3fparent+rdfs%3asubClassOf+%3fgrandParent+.%7d%0d%0a++++++++FILTER+(!bound(%3fgrandParent))%0d%0a%7d&format=srj"
            )
            go_terms = r2.json()
            protein["goTerms"] = go_terms
            protein["logincreds"] = self.logincreds
            # protein["label"] = up["label"]
            protein["id"] = up["id"]
            protein["start"] = self.start
            protein["geneSymbols"] = genesymbolwdmapping
            protein["entrezWikidataIds"] = entrezWikidataIds

            protein_class = HumanProtein(protein)
            # else:
            #     print(up["id"] + " already covered in wikidata")
        except Exception as e:
            print(traceback.format_exc())
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=up["id"],
                    exception_type=type(e),
                    message=e.__str__(),
                    wd_id='-',
                    duration=time.time() - self.start))
import json
import humanprotein

if len(sys.argv) == 1:
    print("Please provide a Uniprot ID")
    print("Example: python singleGeneBot.py P12345")
    sys.exit()

start = time.time()
logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())

uniprotwikidataids = dict()
genesymbolwdmapping = dict()

print('Getting all proteins with a uniprot ID in Wikidata...')
inwikidata = PBB_Core.WDItemList("CLAIM[703:5] AND CLAIM[352]", "352")
for proteinItem in inwikidata.wditems["props"]["352"]:
    uniprotwikidataids[str(proteinItem[2])] = proteinItem[0]

print("Getting all human genes with a gene symbol in Wikidata...")
gene_symbol_mapping = PBB_Core.WDItemList("CLAIM[353] AND CLAIM[703:5]", "353")
for genesymbol in gene_symbol_mapping.wditems["props"]["353"]:
    genesymbolwdmapping[str(genesymbol[2])] = genesymbol[0]

try:
    up = str(sys.argv[1])
    '''
    Get protein annotations from Uniprot
    '''
    r = requests.get(
        "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion+%0d%0a+++++++(group_concat(distinct+%3fencodedBy%3b+separator%3d%22%3b+%22)+as+%3fencoded_by)%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"
        wdPage = PBB_Core.WDItemEngine(wd_item_id=self.source, data=[orthologValue],
                                       server="www.wikidata.org", domain="genes")
        print(wdPage.wd_json_representation)
        wdPage.write(self.logincreds)


logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())

humanEntrezWikidataIds = dict()
mouseEntrezWikidataIds = dict()

print("Getting all human genes in Wikidata")
InWikiData = PBB_Core.WDItemList("CLAIM[703:5] AND CLAIM[351]", "351")
for geneItem in InWikiData.wditems["props"]["351"]:
    humanEntrezWikidataIds[str(geneItem[2])] = geneItem[0]

print("Getting all mouse genes in Wikidata")
InWikiData = PBB_Core.WDItemList("CLAIM[703:83310] AND CLAIM[351]", "351")
for geneItem in InWikiData.wditems["props"]["351"]:
    mouseEntrezWikidataIds[str(geneItem[2])] = geneItem[0]

homologene = open("/tmp/homologene.data", "r")
humanOrthologs = dict()
mouseOrthologs = dict()

for line in homologene:
    fields = line.split('\t')
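# Illustrative sketch (not the original bot's code): how the homologene.data fields might be grouped by
# HomoloGene cluster for the human (9606) and mouse (10090) taxa used above. The column layout
# (group ID, taxonomy ID, Entrez gene ID, gene symbol, protein GI, protein accession) follows NCBI's
# published homologene.data format; the dict structure below is an assumption.
humanOrthologs = dict()
mouseOrthologs = dict()
with open("/tmp/homologene.data", "r") as homologene:
    for line in homologene:
        fields = line.rstrip("\n").split("\t")
        group_id, tax_id, gene_id = fields[0], fields[1], fields[2]
        if tax_id == "9606":
            humanOrthologs.setdefault(group_id, []).append(gene_id)
        elif tax_id == "10090":
            mouseOrthologs.setdefault(group_id, []).append(gene_id)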
import PBB_Core
import requests
import copy
import pprint
from SPARQLWrapper import SPARQLWrapper, JSON
from time import gmtime, strftime

# This bot extends gene items in Wikidata with gene-disease relationship information from the OMIM-sourced
# downloadable dump in Phenocarta. Currently, the bot writes to the genes specified in the source code,
# and uses the genetic association property with references.

# Get Wikidata IDs for all Entrez genes in Wikidata.
ncbi_gene_wikidata_ids = dict()
print("Getting all items with an Entrez gene ID in WikiData (WDQ)")
wdqQuery = "CLAIM[351]"
ncbi_gene_in_wikidata = PBB_Core.WDItemList(wdqQuery, wdprop="351")
for geneItem in ncbi_gene_in_wikidata.wditems["props"]["351"]:
    ncbi_gene_wikidata_ids[str(geneItem[2])] = geneItem[0]

# Maps gene symbols to Gemma/Phenocarta-specific gene IDs, for reference URLs.
gnsym_gemma_ids = dict()

# Retrieve gene-disease relationships from Phenocarta.
source = "http://www.chibi.ubc.ca/Gemma/phenocarta/LatestEvidenceExport/AnnotationsByDataset/OMIM.tsv"
result = requests.get(source, stream=True)

for line in result.iter_lines():
    # First separate each tuple into distinct fields.
    values = dict()
    s = str(line)  # line is a bytes object; str() keeps the b'...' repr, hence the escaped tab below
    fields = s.split("\\t")
    if "#" not in fields[0]:
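# Illustrative alternative (not part of the original bot): a minimal sketch, assuming the Phenocarta TSV
# layout implied above, that decodes each streamed line to text before splitting so the fields do not
# carry the b'...' bytes repr handled by the str()/"\\t" workaround above.
for raw in result.iter_lines(decode_unicode=True):
    if not raw or raw.startswith("#"):
        continue  # skip empty and header/comment lines
    fields = raw.split("\t")
    # fields now holds the plain tab-separated Phenocarta columns for one gene-disease record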
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../ProteinBoxBot_Core")
import PBB_Core
import PBB_Debug
import PBB_login
import PBB_settings
import pprint

gene_ontologydataids = dict()
print('Getting all items with both a Gene Ontology ID (P686) and an Entrez gene ID (P351) in Wikidata...')
inwikidata = PBB_Core.WDItemList("CLAIM[686] and CLAIM[351]", "686")
for goItem in inwikidata.wditems["props"]["686"]:
    if "GO:" not in str(goItem[2]):
        gene_ontologydataids[str(goItem[2])] = goItem[0]

pprint.pprint(gene_ontologydataids)
print(len(gene_ontologydataids))

logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())

prep = dict()
for id in gene_ontologydataids.keys():
    print(id)
    wdid = 'Q' + str(gene_ontologydataids[str(id)])
    prep["P686"] = [PBB_Core.WDBaseDataType.delete_statement(prop_nr='P686')]
    data2Add = []
    for key in prep.keys():