def interwiki_link(entrez, name, username, password):
    # Query Wikidata for the Q-item id (cid) of the gene with this Entrez gene ID
    cid_query = """
        SELECT ?cid WHERE {
            ?cid wdt:P351 ?entrez_id .
            FILTER(?entrez_id = '""" + str(entrez) + """') .
        }
    """

    wikidata_results = PBB_Core.WDItemEngine.execute_sparql_query(
        prefix=settings.PREFIX, query=cid_query)['results']['bindings']

    cid = ''
    for x in wikidata_results:
        cid = x['cid']['value'].split('/')[-1]

    # create interwiki link
    # create your login object with your user and password (or the ProteinBoxBot account?)
    login_obj = PBB_login.WDLogin(user=username, pwd=password)

    # load the gene Wikidata object
    wd_gene_item = PBB_Core.WDItemEngine(wd_item_id=cid)

    # set the interwiki link to the correct Wikipedia page
    wd_gene_item.set_sitelink(site='enwiki', title=name)

    # write the changes to the item
    wd_gene_item.write(login_obj)
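# --- Usage sketch (not part of the original function; names and credentials are illustrative) ---
# Assumes PBB_Core, PBB_login and settings are importable and that real bot credentials are
# supplied by the caller. Entrez 1017 is CDK2, so this would point the matching Wikidata gene
# item at the English Wikipedia article passed as `name`.
interwiki_link(entrez=1017, name='Cyclin-dependent kinase 2',
               username='MyBotAccount', password='not-a-real-password')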
def main(): """ This function undo gene to protein merges. For that, a query searches for WD items which have the Entrez gene ID (P351) and Uniprot ID (P352) on one item. Bases on that, it generates instances of MergeDefender and undoes the merges. :return: None """ print(sys.argv[1]) # pwd = input('Password:'******'ProteinBoxBot', pwd=sys.argv[1]) conflict_set_1 = {'P351'} conflict_set_2 = {'P352'} likely_merged_ids = PBB_Core.WDItemList(wdquery='CLAIM[351] AND CLAIM[352]') print(likely_merged_ids.wditems['items']) for count, x in enumerate(likely_merged_ids.wditems['items']): print('\n', count) print('Q{}'.format(x)) try: MergeDefender(login, merge_target='Q{}'.format(x), conflict_set_1=conflict_set_1, conflict_set_2=conflict_set_2) except Exception as e: traceback.print_exc() PBB_Core.WDItemEngine.log('ERROR', '{main_data_id}, "{exception_type}", "{message}"'.format( main_data_id=x, exception_type=type(e), message=e.__str__(), ))
def main():
    parser = argparse.ArgumentParser(description='Gene Ontology prefix cleaner')
    parser.add_argument('--user', action='store', help='Username on Wikidata', required=True)
    parser.add_argument('--pwd', action='store', help='Password on Wikidata', required=True)
    parser.add_argument('--prefix', action='store', help='The prefix which should be added', required=True)
    parser.add_argument('--prop-nr', action='store', help='The Wikidata property number where the '
                                                          'prefixes need to be checked and fixed', required=True)
    parser.add_argument('--separator', action='store', help='The separator character between prefix '
                                                            'and actual identifier. ":" as default.',
                        required=False, default=':')

    args = parser.parse_args()
    print(args.user, args.pwd, args.prefix, args.prop_nr, args.separator)

    login = PBB_login.WDLogin(user=args.user, pwd=args.pwd)

    GOCleaner(login, prop_nr=args.prop_nr, prefix_str=args.prefix, separator=args.separator)
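# --- Illustration only (the GOCleaner implementation itself is not shown here) ---
# The cleaner's job, as described by the arguments above, is to make sure every value of the
# given property carries the prefix, e.g. '0008150' -> 'GO:0008150', while already prefixed
# values stay untouched. A minimal version of that normalization:
def ensure_prefix(value, prefix='GO', separator=':'):
    return value if value.startswith(prefix + separator) else prefix + separator + value

assert ensure_prefix('0008150') == 'GO:0008150'
assert ensure_prefix('GO:0008150') == 'GO:0008150'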
def main():
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    # biological process (GO:0008150), molecular function (GO:0003674), cellular component (GO:0005575) (Q5058355)
    root_objects = ['0008150', '0003674', '0005575']

    # continue_at = ''
    # stop_at = ''

    file_name = 'temp_GO_onto_map.json'
    if os.path.exists(file_name):
        f = open(file_name, 'r')
        local_qid_onto_map = json.loads(f.read())
        f.close()
    else:
        local_qid_onto_map = {}

    # Ontology ref item is the Wikidata 'Gene Ontology' item
    OBOImporter(root_objects=root_objects, ontology='GO', core_property_nr='P686',
                ontology_ref_item='Q135085', login=login, local_qid_onto_map=local_qid_onto_map,
                use_prefix=True, fast_run=True, fast_run_base_filter={'P686': ''})
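# --- Sketch (not part of the original script): persisting the resumable term->QID cache ---
# local_qid_onto_map is assumed to map ontology term IDs to the Wikidata QIDs handled so far,
# so an interrupted import can be resumed from temp_GO_onto_map.json. How OBOImporter fills
# the map is not shown here; this only illustrates writing the cache back out after a run.
def save_onto_map(local_qid_onto_map, file_name='temp_GO_onto_map.json'):
    with open(file_name, 'w') as f:
        json.dump(local_qid_onto_map, f)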
def __init__(self):
    self.start = time.time()
    self.content = ET.fromstring(self.download_disease_ontology())
    self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())
    # self.updateDiseaseOntologyVersion()

    # Get all WikiData entries that contain a Disease Ontology ID
    print("Getting all terms with a Disease Ontology ID in WikiData")
    doWikiData_id = dict()
    DoInWikiData = PBB_Core.WDItemList("CLAIM[699]", "699")

    print("Getting latest version of Disease Ontology from Github")
    r = requests.get("https://api.github.com/repos/DiseaseOntology/HumanDiseaseOntology/git/refs")
    test = r.json()
    sha = test[0]["object"]["sha"]
    githubReferenceUrl = "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/" + sha + "/src/ontology/doid.owl"

    for diseaseItem in DoInWikiData.wditems["props"]["699"]:
        doWikiData_id[str(diseaseItem[2])] = diseaseItem[0]  # diseaseItem[2] = DO identifier, diseaseItem[0] = WD identifier

    for doClass in self.content.findall('.//owl:Class', DiseaseOntology_settings.getDoNameSpaces()):
        try:
            disVars = []
            disVars.append(doClass)
            disVars.append(githubReferenceUrl)
            disVars.append(doWikiData_id)
            disVars.append(self.logincreds)
            disVars.append(self.start)

            diseaseClass = disease(disVars)

            print("do_id: " + diseaseClass.do_id)
            print(diseaseClass.wdid)
            print(diseaseClass.name)
            print(diseaseClass.synonyms)
            print(diseaseClass.xrefs)
        except Exception as e:
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=diseaseClass.do_id,
                    exception_type=type(e),
                    message=e.__str__(),
                    wd_id='-',
                    duration=time.time() - self.start))
            f = open('/tmp/Diseaseexceptions.txt', 'a')
            # f.write("Unexpected error:", sys.exc_info()[0]+'\n')
            f.write(diseaseClass.do_id + "\n")
            # f.write(diseaseClass.wd_json_representation)
            traceback.print_exc(file=f)
            f.close()
def main():
    pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=pwd)

    # for mouse genes
    # LabelReplacement(PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:83310]').wditems['items'],
    #                  {'gène': 'gène de souris'}, 'fr', login)

    # for human genes
    LabelReplacement(PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:5]').wditems['items'],
                     {'gène': 'gène humain'}, 'fr', login)
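# --- Sketch of the label-replacement idea (LabelReplacement itself is not shown here) ---
# Assumes PBB_Core.WDItemEngine exposes get_label()/set_label() with a lang argument, as
# get_label() is used by other scripts in this collection; replaces the generic French
# label 'gène' with a more specific one on a single item.
def replace_label(qid, old_label, new_label, lang, login):
    item = PBB_Core.WDItemEngine(wd_item_id=qid)
    if item.get_label(lang=lang) == old_label:
        item.set_label(new_label, lang=lang)
        item.write(login)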
def __init__(self):
    self.start = time.time()
    self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())

    # Get all WikiData entries that contain a Gene Ontology ID
    print("Getting all terms with a Gene Ontology ID in WikiData")
    goWikiData_id = dict()
    goInWikiData = PBB_Core.WDItemList("CLAIM[686]", "686")

    for goItem in goInWikiData.wditems["props"]["686"]:
        goWikiData_id[str(goItem[2])] = goItem[0]  # goItem[2] = GO identifier, goItem[0] = WD identifier

    print(len(goWikiData_id.keys()))
    sys.exit()  # note: this early exit stops the script before the ontology parsing below ever runs

    graph = rdflib.Graph()
    goUrl = requests.get("http://purl.obolibrary.org/obo/go.owl")
    print("ja")
    graph.parse(data=goUrl.text, format="application/rdf+xml")
    cls = URIRef("http://www.w3.org/2002/07/owl#Class")
    subcls = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")

    counter = 0
    for gouri in graph.subjects(RDF.type, cls):
        try:
            counter = counter + 1
            print(counter)

            goVars = dict()
            goVars["uri"] = gouri
            goVars["label"] = graph.label(URIRef(gouri))
            goVars["wikidata_id"] = goWikiData_id
            goVars["logincreds"] = self.logincreds
            goVars["start"] = self.start
            goVars["graph"] = graph

            if "GO" in gouri:
                goClass = goTerm(goVars)
        except Exception as e:
            print(traceback.format_exc())
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=gouri,
                    exception_type=type(e),
                    message=e.__str__(),
                    wd_id='-',
                    duration=time.time() - self.start))
def __init__(self): self.content = json.loads(self.download_mouse_proteins()) # print self.content["results"]["bindings"] self.protein_count = len(self.content["results"]["bindings"]) self.proteins = self.content["results"]["bindings"] self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword()) uniprotWikidataIds = dict() print "Getting all proteins with a uniprot ID in Wikidata" InWikiData = PBB_Core.WDItemList("CLAIM[703:83310] AND CLAIM[352]", "352") r0 = requests.get( "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+taxonomy%3a+%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aSELECT+DISTINCT+*%0d%0aWHERE%0d%0a%7b%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++%09%09%3fprotein+rdfs%3alabel+%3fprotein_label+.%0d%0a++++++++%3fprotein+up%3aorganism+taxonomy%3a10090+.%0d%0a%7d&format=srj" ) for proteinItem in InWikiData.wditems["props"]["352"]: try: uniprotWikidataIds[str(proteinItem[2])] = proteinItem[0] r = requests.get( "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f" + str(proteinItem[2]) + "%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.+%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fplabel+%3fecName+%3fupversion&format=srj" ) # r = 
requests.get("http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f"+str(proteinItem[2])+"%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fplabel+%3fecName&format=srj") print r.text protein = json.loads(r.text) protein["logincreds"] = self.logincreds protein["wdid"] = 'Q' + str(proteinItem[0]) print protein proteinClass = mouse_protein(protein) except: # client = Client('http://*****:*****@sentry.sulab.org/9') # client.captureException() print "There has been an except" print "Unexpected error:", sys.exc_info()[0] f = open('/tmp/exceptions.txt', 'a') # f.write("Unexpected error:", sys.exc_info()[0]+'\n') f.write( str(protein["results"]["bindings"][0]["uniprot"]["value"]) + "\n") traceback.print_exc(file=f) f.close()
def __init__(self):
    self.start = time.time()
    self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())

    # Get all WikiData entries that contain a Uberon ID
    print("Getting all terms with a Uberon ID in WikiData")
    ubWikiData_id = dict()
    ubInWikiData = PBB_Core.WDItemList("CLAIM[1554]", "1554")

    for uberonItem in ubInWikiData.wditems["props"]["1554"]:
        ubWikiData_id[str(uberonItem[2])] = uberonItem[0]  # uberonItem[2] = Uberon identifier, uberonItem[0] = WD identifier

    graph = rdflib.Graph()
    ubUrl = requests.get("http://purl.obolibrary.org/obo/uberon.owl")
    print("ja")
    graph.parse(data=ubUrl.text, format="application/rdf+xml")
    cls = URIRef("http://www.w3.org/2002/07/owl#Class")
    subcls = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")

    for uberonuri in graph.subjects(RDF.type, cls):
        try:
            uberonVars = dict()
            uberonVars["uberon"] = uberonuri
            uberonVars["uberonLabel"] = graph.label(URIRef(uberonuri))
            uberonVars["wikidata_id"] = ubWikiData_id
            uberonVars["logincreds"] = self.logincreds
            uberonVars["start"] = self.start
            uberonVars["graph"] = graph

            if "UBERON" in uberonuri:
                uberonClass = uberonTerm(uberonVars)
        except Exception as e:
            print(traceback.format_exc())
            PBB_Core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    main_data_id=uberonuri,
                    exception_type=type(e),
                    message=e.__str__(),
                    wd_id='-',
                    duration=time.time() - self.start))
def main():
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    root_objects = ['11946']

    OBOImporter.obo_synonyms = {
        'SMILES': 'P233',
        'InChIKey': 'P235',
        'FORMULA': 'P274'
    }

    file_name = 'temp_GO_onto_map.json'
    if os.path.exists(file_name):
        f = open(file_name, 'r')
        local_qid_onto_map = json.loads(f.read())
        f.close()
    else:
        local_qid_onto_map = {}

    # Ontology ref item is the Wikidata 'ChEBI' item
    OBOImporter(root_objects=root_objects, ontology='CHEBI', core_property_nr='P683',
                ontology_ref_item='Q902623', login=login, local_qid_onto_map=local_qid_onto_map)
def __init__(self): self.start = time.time() self.logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword()) uniprotwikidataids = dict() genesymbolwdmapping = dict() print('Getting all proteins with a uniprot ID in Wikidata...') inwikidata = PBB_Core.WDItemList("CLAIM[703:5] AND CLAIM[352]", "352") for proteinItem in inwikidata.wditems["props"]["352"]: uniprotwikidataids[str(proteinItem[2])] = proteinItem[0] print('Getting all human genes with a ncbi gene ID in Wikidata...') entrezWikidataIds = dict() print("wdq 1") wdqQuery = "CLAIM[703:5] AND CLAIM[351]" InWikiData = PBB_Core.WDItemList(wdqQuery, wdprop="351") ''' Below a mapping is created between entrez gene ids and wikidata identifiers. ''' for geneItem in InWikiData.wditems["props"]["351"]: entrezWikidataIds[str(geneItem[2])] = geneItem[0] print("Getting all human proteins from Uniprot...") # r0 = requests.get("http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+taxonomy%3a+%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+xsd%3a+%3chttp%3a%2f%2fwww.w3.org%2f2001%2fXMLSchema%23%3e%0d%0aSELECT+DISTINCT+*%0d%0aWHERE%0d%0a%7b%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++++++++%3fprotein+up%3areviewed+%22true%22%5e%5exsd%3aboolean+.%0d%0a++%09%09%3fprotein+rdfs%3alabel+%3fprotein_label+.%0d%0a++++++++%3fprotein+up%3aorganism+taxonomy%3a9606+.%0d%0a%7d&format=srj") r0 = requests.get( 'http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+taxonomy%3a+%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+xsd%3a+%3chttp%3a%2f%2fwww.w3.org%2f2001%2fXMLSchema%23%3e%0d%0aSELECT+DISTINCT+*%0d%0aWHERE%0d%0a%7b%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++++++++%3fprotein+up%3areviewed+%22true%22%5e%5exsd%3aboolean+.%0d%0a++%09%09%3fprotein+rdfs%3alabel+%3fprotein_label+.%0d%0a++++++++%3fprotein+up%3aorganism+taxonomy%3a9606+.%0d%0a%7d&format=srj' ) prot_results = r0.json() uniprot_ids = [] for protein in prot_results["results"]["bindings"]: item = dict() item["id"] = protein["protein"]["value"].replace( "http://purl.uniprot.org/uniprot/", "") item["label"] = protein["protein_label"]["value"] uniprot_ids.append(item) for up in uniprot_ids: try: #if up["id"] not in uniprotwikidataids: ''' Get protein annotations from Uniprot ''' #r = requests.get( # "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion+%0d%0a+++++++(group_concat(distinct+%3fencodedBy%3b+separator%3d%22%3b+%22)+as+%3fencoded_by)%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f" + # str(up["id"]) + # 
"%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.+%0d%0a++++++++%3funiprot+up%3aencodedBy+%3fgene+.%0d%0a%09%09%3fgene+skos%3aprefLabel+%3fencodedBy+.%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fencodedBy+%3fplabel+%3fecName+%3fupversion&format=srj") r = requests.get( "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e%0d%0aPREFIX+taxonomy%3a%3chttp%3a%2f%2fpurl.uniprot.org%2ftaxonomy%2f%3e%0d%0aPREFIX+database%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fdatabase%2f%3e%0d%0aSELECT+%3funiprot+%3fplabel+%3fecName+%3fupversion+%0d%0a+++++++(group_concat(distinct+%3fencodedBy%3b+separator%3d%22%3b+%22)+as+%3fencoded_by)%0d%0a+++++++(group_concat(distinct+%3fncbiGene%3b+separator%3d%22%3b+%22)+as+%3fgene_id)%0d%0a+++++++(group_concat(distinct+%3falias%3b+separator%3d%22%3b+%22)+as+%3fupalias)%0d%0a+++++++(group_concat(distinct+%3fpdb%3b+separator%3d%22%3b+%22)+as+%3fpdbid)%0d%0a+++++++(group_concat(distinct+%3frefseq%3b+separator%3d%22%3b+%22)+as+%3frefseqid)%0d%0a+++++++(group_concat(distinct+%3fensP%3b+separator%3d%22%3b+%22)+as+%3fensemblp)%0d%0aWHERE%0d%0a%7b%0d%0a%09%09VALUES+%3funiprot+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f" + str(up["id"]) + "%3e%7d%0d%0a++++++++%3funiprot+rdfs%3alabel+%3fplabel+.%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.+%0d%0a++++++++%3funiprot+up%3aencodedBy+%3fgene+.%0d%0a%09++++%3fgene+skos%3aprefLabel+%3fencodedBy+.%0d%0a++++++++optional%7b%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++%3fupAlias+up%3aecName+%3fecName+.%7d%0d%0a++++++++optional%7b%3funiprot+rdfs%3aseeAlso+%3fncbiGene+.%0d%0a++++++++%3fncbiGene+up%3adatabase+database%3aGeneID+.%7d%0d%0a++++++++%0d%0a++++++++OPTIONAL%7b+%3funiprot+up%3aalternativeName+%3fupAlias+.%0d%0a++++++++++%7b%3fupAlias+up%3afullName+%3falias+.%7d+UNION%0d%0a++++++++%7b%3fupAlias+up%3ashortName+%3falias+.%7d%7d%0d%0a++++++++%3funiprot+up%3aversion+%3fupversion+.%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fpdb+.%0d%0a++++++++%3fpdb+up%3adatabase+database%3aPDB+.%7d%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3frefseq+.%0d%0a++++++++%3frefseq+up%3adatabase+database%3aRefSeq+.%7d++%0d%0a++++++++OPTIONAL%7b%3funiprot+rdfs%3aseeAlso+%3fensT+.%0d%0a++++++++%3fensT+up%3adatabase+database%3aEnsembl+.%0d%0a++++++++%3fensT+up%3atranslatedTo+%3fensP+.%7d%0d%0a%7d%0d%0agroup+by+%3fupAlias+%3funiprot+%3fencodedBy+%3fplabel+%3fecName+%3fupversion&format=srj" ) protein = r.json() if len(protein["results"]["bindings"]) == 0: raise Exception("Communication error on " + up["id"]) #if "results" not in protein.keys(): ''' Get go annotations from Uniprot ''' 
r2 = requests.get( "http://sparql.uniprot.org/sparql?query=PREFIX+up%3a%3chttp%3a%2f%2fpurl.uniprot.org%2fcore%2f%3e+%0d%0aPREFIX+skos%3a%3chttp%3a%2f%2fwww.w3.org%2f2004%2f02%2fskos%2fcore%23%3e+%0d%0aSELECT+DISTINCT+%3fprotein+%3fgo+%3fgoLabel+%3fparentLabel%0d%0aWHERE%0d%0a%7b%0d%0a++%09%09VALUES+%3fprotein+%7b%3chttp%3a%2f%2fpurl.uniprot.org%2funiprot%2f" + str(up["id"]) + "%3e%7d%0d%0a%09%09%3fprotein+a+up%3aProtein+.%0d%0a++%09%09%3fprotein+up%3aclassifiedWith+%3fgo+.+++%0d%0a++++++++%3fgo+rdfs%3alabel+%3fgoLabel+.%0d%0a++++++++%3fgo+rdfs%3asubClassOf*+%3fparent+.%0d%0a++++++++%3fparent+rdfs%3alabel+%3fparentLabel+.%0d%0a++++++++optional+%7b%3fparent+rdfs%3asubClassOf+%3fgrandParent+.%7d%0d%0a++++++++FILTER+(!bound(%3fgrandParent))%0d%0a%7d&format=srj" ) go_terms = r2.json() protein["goTerms"] = go_terms protein["logincreds"] = self.logincreds # protein["label"] = up["label"] protein["id"] = up["id"] protein["start"] = self.start protein["geneSymbols"] = genesymbolwdmapping protein["entrezWikidataIds"] = entrezWikidataIds protein_class = HumanProtein(protein) #else: #print(up["id"]+" already covered in wikidata") except Exception as e: print(traceback.format_exc()) PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id=up["id"], exception_type=type(e), message=e.__str__(), wd_id='-', duration=time.time() - self.start))
import time
import pprint

try:
    import simplejson as json
except ImportError as e:
    import json

import traceback

start = time.time()

if len(sys.argv) == 1:
    print("Please provide a Disease Ontology ID")
    print("Example: python single_disease_bot.py 628")
    sys.exit()

logincreds = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())

content = ET.fromstring(requests.get(DiseaseOntology_settings.getdoUrl()).text)
doDate = content.findall('.//oboInOwl:date', DiseaseOntology_settings.getDoNameSpaces())
doversion = content.findall('.//owl:versionIRI', DiseaseOntology_settings.getDoNameSpaces())

# build the release label from the ontology date field; a worked example follows this snippet
dateList = doDate[0].text.split(' ')[0].split(":")
searchTerm = "Disease ontology release " + dateList[2] + "-" + dateList[1] + "-" + dateList[0]

url = 'https://www.wikidata.org/w/api.php'
params = {
    'action': 'wbsearchentities',
    'format': 'json',
    'language': 'en',
    'type': 'item',
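# --- Worked example for the date handling above (illustrative value; assumes the oboInOwl:date
# field uses the usual 'dd:mm:yyyy hh:mm' layout, which is what the index order above implies) ---
example_date = '15:01:2016 12:30'
example_list = example_date.split(' ')[0].split(':')
assert example_list == ['15', '01', '2016']
assert "Disease ontology release " + example_list[2] + "-" + example_list[1] + "-" + example_list[0] \
       == "Disease ontology release 2016-01-15"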
def main(): def read_file(url): if not os.path.exists('./data'): os.makedirs('./data') file_name = url.split('/')[-1] file_path = './data/{}'.format(file_name) if not read_local or not os.path.isfile(file_path): requests_ftp.monkeypatch_session() s = requests.Session() if url.startswith('ftp://'): reply = s.retr(url, stream=True) else: reply = s.get(url, stream=True) with open(file_path, 'wb') as f: for chunk in reply.iter_content(chunk_size=2048): if chunk: f.write(chunk) f.flush() if file_name.endswith('.gz'): f = gzip.open(file_path, 'rt') else: f = open(file_path, 'rt') cnt = 0 while f: line = f.readline() if line is None or line == '': break cnt += 1 if cnt % 100000 == 0: print('count: ', cnt) yield line def get_uniprot_for_entrez(): # query WD for all entrez IDs (eukaryotic) # query Uniprot for all high quality annotated Uniprots based on the entrez id. query = ''' SELECT * WHERE { ?gene wdt:P351 ?entrez . {?gene wdt:P703 wd:Q5} UNION {?gene wdt:P703 wd:Q83310} . {?gene wdt:P354 ?res_id} UNION {?gene wdt:P671 ?res_id} . } ''' results = PBB_Core.WDItemEngine.execute_sparql_query( query=query)['results']['bindings'] entrez_to_qid = dict() global res_id_to_entrez_qid res_id_to_entrez_qid = dict() for z in results: # ensure that the correct prefix exists so the identifier can be found in the Uniprot XML file res_id = z['res_id']['value'] entrez_qid = z['gene']['value'].split('/')[-1] entrez_id = z['entrez']['value'] if len(res_id.split(':')) <= 1: res_id = 'HGNC:' + z['res_id']['value'] entrez_to_qid[entrez_id] = (entrez_qid, res_id) res_id_to_entrez_qid.update({res_id: (entrez_qid, entrez_id)}) print('Wikidata Entrez query complete') uniprot_to_qid = get_all_wd_uniprots() print('Wikidata Uniprot query complete') up_prefix = ''' PREFIX taxon:<http://purl.uniprot.org/taxonomy/> PREFIX up:<http://purl.uniprot.org/core/> PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> ''' headers = { 'content-type': 'application/sparql-results+json', 'charset': 'utf-8' } up_query = ''' SELECT DISTINCT * WHERE { ?uniprot rdfs:seeAlso ?gene . ?uniprot up:reviewed ?reviewed . {?uniprot up:organism taxon:9606} UNION {?uniprot up:organism taxon:10090} . FILTER regex(?gene, "^http://purl.uniprot.org/geneid/") } GROUP BY ?uniprot ?gene ?reviewed ''' query_string = up_prefix + up_query data = {'format': 'srj', 'query': query_string} if read_local and os.path.isfile('uniprot_entrez_map.json'): with open('uniprot_entrez_map.json', 'r') as f: results = json.load(f) else: reply = requests.post(url='http://sparql.uniprot.org/sparql/', params=data, headers=headers) results = reply.json() with open('uniprot_entrez_map.json', 'w') as of: json.dump(results, of) print('Uniprot query complete') uniprot_map = dict() for ids in results['results']['bindings']: entrez_id = ids['gene']['value'].split('/')[-1] uniprot_id = ids['uniprot']['value'].split('/')[-1] reviewed = False if ids['reviewed']['value'] == 'true': reviewed = True if reviewed or (not reviewed and uniprot_id in uniprot_to_qid): if entrez_id not in entrez_to_qid: print('Entrez ID {} not in Wikidata'.format(entrez_id)) continue if uniprot_id not in uniprot_to_qid: protein_qid = '' else: protein_qid = uniprot_to_qid[uniprot_id] uniprot_map[uniprot_id] = { 'entrez': { 'id': entrez_id, 'qid': entrez_to_qid[entrez_id][0], 'res_id': entrez_to_qid[entrez_id][1] }, 'qid': protein_qid } # Uniprot items in WD without a link to a gene should also be updated, therefore add them to uniprot_map, # keep entrez empty. 
for wd_protein_item in uniprot_to_qid: if wd_protein_item not in uniprot_map: uniprot_map[wd_protein_item] = { 'entrez': { 'id': '', 'qid': '', 'res_id': '' }, 'qid': uniprot_to_qid[wd_protein_item] } return uniprot_map def get_all_wd_uniprots(): query = ''' SELECT * WHERE { ?protein wdt:P352 ?uniprot . {?protein wdt:P703 wd:Q5} UNION {?protein wdt:P703 wd:Q83310} . } ''' results = PBB_Core.WDItemEngine.execute_sparql_query( query=query)['results']['bindings'] return { z['uniprot']['value']: z['protein']['value'].split('/')[-1] for z in results } def get_go_map(): query = ''' SELECT * WHERE { ?qid wdt:P686 ?go . } ''' results = PBB_Core.WDItemEngine.execute_sparql_query( query=query)['results']['bindings'] go_to_qid = dict() for z in results: go_to_qid[z['go']['value']] = { 'qid': z['qid']['value'].split('/')[-1], 'go_class_prop': '' } return go_to_qid def get_pdb_to_uniprot(): file = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_pdb.csv.gz' pdb_uniprot_map = dict() for c, line in enumerate(read_file(file)): if c < 2: print(line) continue dt = line.strip('\n').split(',') if dt[0] not in pdb_uniprot_map: pdb_uniprot_map[dt[0]] = dt[1].split(';') return pdb_uniprot_map def const_go_map(): base_dict = {'go_terms': list(), 'evidence': list(), 'pdb': set()} file = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/pdb_chain_go.csv.gz' pdb_go_map = dict() for c, line in enumerate(read_file(file)): if c < 2: print(line) c += 1 continue dt = line.strip('\n').split(',') uniprot = copy.copy(dt[2]) if uniprot not in base_map: continue if uniprot not in pdb_go_map: pdb_go_map[uniprot] = copy.deepcopy(base_dict) pdb_go_map[uniprot]['go_terms'].append(dt[-1]) pdb_go_map[uniprot]['evidence'].append(dt[-2]) pdb_go_map[uniprot]['pdb'].add(dt[0]) print('total number of PDBs', len(pdb_go_map)) pdb_to_uniprot = get_pdb_to_uniprot() for uniprot in pdb_to_uniprot: if uniprot in pdb_go_map: pdb_go_map[uniprot]['pdb'].update(pdb_to_uniprot[uniprot]) else: pdb_go_map[uniprot] = copy.deepcopy(base_dict) pdb_go_map[uniprot]['pdb'] = set(pdb_to_uniprot[uniprot]) entrez_to_uniprot = { base_map[z]['entrez']['id']: z for z in base_map if base_map[z]['entrez']['id'] } # Download and process latest human and mouse GO term annotation files files = [ 'http://geneontology.org/gene-associations/gene_association.goa_human.gz', 'http://geneontology.org/gene-associations/gene_association.mgi.gz' ] for file in files: for line in read_file(file): if line.startswith('!'): continue cols = line.split('\t') uniprot = cols[1] go_id = cols[4] evidence = cols[6] go_class = cols[8] if cols[0] == 'MGI': try: mgi = cols[1] entrez = res_id_to_entrez_qid[mgi][1] uniprot = entrez_to_uniprot[entrez] except KeyError: continue if uniprot not in pdb_go_map: pdb_go_map[uniprot] = copy.deepcopy(base_dict) pdb_go_map[uniprot]['go_terms'].append(go_id) pdb_go_map[uniprot]['evidence'].append(evidence) try: go_prop_map[go_id][ 'go_class_prop'] = ProteinBot.get_go_class( go_id, go_class) except KeyError: print('GO term {} not yet in Wikidata'.format(go_id)) continue return pdb_go_map parser = argparse.ArgumentParser(description='ProteinBot parameters') parser.add_argument('--run-locally', action='store_true', help='Locally stored data files and run progress ' 'will be used. 
Acts also as if continuing a run.')
    parser.add_argument('--user', action='store', help='Username on Wikidata', required=True)
    parser.add_argument('--pwd', action='store', help='Password on Wikidata', required=True)

    args = parser.parse_args()
    read_local = args.run_locally
    login = PBB_login.WDLogin(user=args.user, pwd=args.pwd)

    # generate a basic mapping of Uniprot to Entrez and Wikidata genes and proteins
    base_map = get_uniprot_for_entrez()
    # generate mappings of GO terms to their Wikidata QIDs
    go_prop_map = get_go_map()
    # generate a map of Uniprot IDs with the matching PDB IDs, GO terms and GO evidence codes
    pdb_to_go = const_go_map()

    if read_local and os.path.isfile('uniprot_progress.json'):
        with open('uniprot_progress.json', 'r') as infile:
            progress = json.load(infile)
    else:
        progress = dict()

    for count, x in enumerate(base_map):
        if x in progress:
            continue

        pprint.pprint(x)
        pprint.pprint(base_map[x])

        ProteinBot(uniprot=x, base_map=base_map, pdb_to_go=pdb_to_go, go_prop_map=go_prop_map,
                   login=login, progress=progress)

        with open('uniprot_progress.json', 'w') as outfile:
            json.dump(progress, outfile)
" Usage: MicrobeBotModularPackage.py <Wikidata user name> <Wikidata Password> <run number> <domain " "i.e. genes/proteins/encode_genes/encode_proteins>, <number of genomes to process> " ) sys.exit() else: pass def chunks(l, n): """Yield successive n-sized chunks from l.""" for c in range(0, len(l), n): yield l[c:c + n] # Login to Wikidata with bot credentials login = PBB_login.WDLogin(sys.argv[1], sys.argv[2]) # Retrieve Current Bacterial Reference Genomes from NCBI print('Retrieving current list of NCBI Bacterial Reference Genomes') print('Standby...') genome_records = MBR.get_ref_microbe_taxids() ref_taxids = genome_records['taxid'].tolist() # break up list of taxids into chunks of 5 for subruns count = 0 runs_list = chunks(ref_taxids, int(sys.argv[5])) taxids = {} for i in runs_list: count += 1
def main(): prefix = ''' PREFIX schema: <http://schema.org/> PREFIX wd: <http://www.wikidata.org/entity/> PREFIX wdt: <http://www.wikidata.org/prop/direct/> ''' query = ''' SELECT ?entrez_id ?cid ?article ?label WHERE { ?cid wdt:P351 ?entrez_id . ?cid wdt:P703 wd:Q5 . OPTIONAL { ?cid rdfs:label ?label filter (lang(?label) = "en") . } ?article schema:about ?cid . ?article schema:inLanguage "en" . FILTER (SUBSTR(str(?article), 1, 25) = "https://en.wikipedia.org/") . FILTER (SUBSTR(str(?article), 1, 38) != "https://en.wikipedia.org/wiki/Template") } ''' print(sys.argv[1]) sparql_results = PBB_Core.WDItemEngine.execute_sparql_query(query=query, prefix=prefix) curr_date = datetime.datetime.now() end_date = datetime.date(year=curr_date.year, month=curr_date.month, day=1) - datetime.timedelta(days=1) start_date = datetime.date(year=end_date.year, month=end_date.month, day=1) total_views = 0 from_timestamp = '{}00'.format(start_date.strftime('%Y%m%d')) to_timestamp = '{}00'.format(end_date.strftime('%Y%m%d')) all_items = list() url = 'https://en.wikipedia.org/w/api.php' for count, i in enumerate(sparql_results['results']['bindings']): article = i['article']['value'].split('/')[-1] print(article) r = requests.get(url='https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/' 'en.wikipedia/all-access/user/{}/daily/{}/{}'.format(article, from_timestamp, to_timestamp)) article_views = 0 if 'items' in r.json(): for day in r.json()['items']: article_views += day['views'] total_views += article_views params = { 'action': 'query', 'prop': 'pageprops|info', 'titles': urllib.parse.unquote(article), 'format': 'json' } page_size = 0 size_results = requests.get(url=url, params=params).json()['query']['pages'] for x in size_results.values(): page_size = x['length'] all_items.append((urllib.parse.unquote(article), article_views, page_size)) # do some printing for the user print(count, 'article views: ', article_views, 'total views: ', total_views, 'mean views: ', total_views/(count + 1), 'page size:', page_size) if count % 100 == 0: # print top accessed pages all_items.sort(key=lambda z: z[1], reverse=True) pprint.pprint(all_items[0:10]) # print largest pages all_items.sort(key=lambda z: z[2], reverse=True) pprint.pprint(all_items[0:10]) else: pprint.pprint(r.text) # final sort and print top accessed pages all_items.sort(key=lambda z: z[1], reverse=True) pprint.pprint(all_items[0:10]) table_data = [all_items[0:10]] # print largest pages all_items.sort(key=lambda z: z[2], reverse=True) pprint.pprint(all_items[0:10]) table_data.append(all_items[0:10]) login = PBB_login.WDLogin(user='******', pwd=sys.argv[1], server='en.wikipedia.org') # get page text params = { 'action': 'query', 'titles': 'Portal:Gene_Wiki/Quick_Links', 'prop': 'revisions', 'rvprop': 'content', 'format': 'json' } page_text = [x['revisions'][0]['*'] for x in requests.get(url=url, params=params).json()['query']['pages'].values()][0] re_pattern = re.match(re.compile('^{.*?}', re.DOTALL), page_text) wp_string = \ '''{{| align="right" border="1" style="text-align:center" cellpadding="0" cellspacing="0" class="wikitable" |+ Top Gene Wiki articles (as of {}. 1, {}) ! Rank !! by size (word count) !! 
by page views in {}., {}{} |}}''' wp_table_row = ''' |- |{0} | [[{1}]] | [[{2}]]''' tmp_string = '' for i in range(1, 11): tmp_string += wp_table_row.format(i, table_data[1][i - 1][0], table_data[0][i - 1][0]) table_string = wp_string.format(curr_date.strftime("%B")[0:3], curr_date.year, end_date.strftime("%B")[0:3], end_date.year, tmp_string) print(table_string + page_text[re_pattern.end():]) params = { 'action': 'edit', 'title': 'Portal:Gene_Wiki/Quick_Links', 'section': '0', 'text': table_string + page_text[re_pattern.end():], 'token': login.get_edit_token(), 'format': 'json' } r = requests.post(url=url, data=params, cookies=login.get_edit_cookie()) pprint.pprint(r.json())
def main(): # 'https://www.ebi.ac.uk/chembl/api/data/drug_indication/?molecule_chembl_id=CHEMBL1637&limit=100&format=json' # params = { # 'molecule_chembl_id': 'CHEMBL1637', # 'limit': '1000', # 'format': 'json' # } # # url = 'https://www.ebi.ac.uk/chembl/api/data/drug_indication' # # r = requests.get(url, params=params) # pprint.pprint(r.json()) # # 'https://www.ebi.ac.uk/chembl/api/data/drug_indication.json?limit=1000&offset=0' # get_parent_molecule('CHEMBL2364968') chembl_wd_map = get_id_wd_map('P592') mesh_wd_map = get_id_wd_map('P486') ndfrt_wd_map = get_id_wd_map('P2115') wd_ndfrt_map = {ndfrt_wd_map[x]: x for x in ndfrt_wd_map} # contains drug QIDs as keys, and a dict of 'disease_qid', 'source_id' as keys. values are disease item QID and the # db identifier for NDF-RT or CHEMBL. drug_disease_map = dict() if os.path.isfile('drug_disease.json'): with open('drug_disease.json', 'r') as infile: drug_disease_map = json.load(infile) for nui in ndfrt_wd_map: diseases = get_ndfrt_drug_links(nui) drug_qid = ndfrt_wd_map[nui] for disease_mesh in diseases: if not disease_mesh: continue elif disease_mesh in mesh_wd_map: disease_qid = mesh_wd_map[disease_mesh] else: print('Disease not found in Wikidata:', disease_mesh, diseases[disease_mesh]) continue if drug_qid in drug_disease_map: drug_disease_map[drug_qid]['disease_qid'].append(disease_qid) drug_disease_map[drug_qid]['source_id'].append(nui) else: drug_disease_map.update({ drug_qid: { 'disease_qid': [disease_qid], 'source_id': [nui] } }) # pprint.pprint(drug_disease_map) if os.path.isfile('full_drug_disease_map.json'): with open('full_drug_disease_map.json', 'r') as infile: drug_disease_map = json.load(infile) else: all_indications = get_all_chembl_indications() all_indications.to_csv('all_chembl_indications.csv', index=False) unique_chembl_ids = all_indications['molecule_chembl_id'].unique() chembl_to_parent = dict() unique_mesh_ids = all_indications['mesh_id'].unique() for chembl_id in unique_chembl_ids: # print('chembl id:', chembl_id) if chembl_id in chembl_wd_map: curr_chembl = chembl_id else: parent_chembl = get_parent_molecule(chembl_id) chembl_to_parent.update({chembl_id: parent_chembl}) curr_chembl = parent_chembl if curr_chembl not in chembl_wd_map: print(curr_chembl, 'not found in Wikidata') continue curr_drug_qid = chembl_wd_map[curr_chembl] chembl_id_df = all_indications[all_indications['molecule_chembl_id'] == curr_chembl] # pprint.pprint(chembl_id_df) for x in chembl_id_df.index: curr_mesh = chembl_id_df.loc[x, 'mesh_id'] # print('this current mesh', curr_mesh) if pd.notnull(curr_mesh) and curr_mesh in mesh_wd_map: print(curr_chembl, curr_mesh, 'pair found', 'index', x) disease_qid = mesh_wd_map[curr_mesh] if curr_drug_qid in drug_disease_map: if disease_qid not in drug_disease_map[curr_drug_qid]['disease_qid']: drug_disease_map[curr_drug_qid]['disease_qid'].append(disease_qid) drug_disease_map[curr_drug_qid]['source_id'].append(chembl_id) else: drug_disease_map.update({ curr_drug_qid: { 'disease_qid': [disease_qid], 'source_id': [chembl_id] } }) with open('full_drug_disease_map.json', 'w') as outfile: json.dump(drug_disease_map, outfile) print(sys.argv[1]) login = PBB_login.WDLogin(user='******', pwd=sys.argv[1]) for count, drug in enumerate(drug_disease_map): statements = list() for c, disease in enumerate(drug_disease_map[drug]['disease_qid']): ref_source_id = drug_disease_map[drug]['source_id'][c] references = generate_refs(ref_source_id) statements.append(PBB_Core.WDItemID(value=disease, prop_nr='P2175', 
references=references)) try: item = PBB_Core.WDItemEngine(wd_item_id=drug, data=statements) item_qid = item.write(login) print('sucessfully written to', item_qid, item.get_label()) except Exception as e: print('write failed to drug item:', drug) print(e) # if count > 2: # break disease_drug_map = {z: {'drug_qid': list(), 'source_id': list()} for x in drug_disease_map for z in drug_disease_map[x]['disease_qid']} for count, drug in enumerate(drug_disease_map): for c, disease in enumerate(drug_disease_map[drug]['disease_qid']): source = drug_disease_map[drug]['source_id'][c] disease_drug_map[disease]['drug_qid'].append(drug) disease_drug_map[disease]['source_id'].append(source) for count, disease in enumerate(disease_drug_map): statements = list() for c, drug in enumerate(disease_drug_map[disease]['drug_qid']): ref_source_id = disease_drug_map[disease]['source_id'][c] references = generate_refs(ref_source_id) statements.append(PBB_Core.WDItemID(value=drug, prop_nr='P2176', references=references)) try: item = PBB_Core.WDItemEngine(wd_item_id=disease, data=statements) item_qid = item.write(login) print('sucessfully written to', item_qid, item.get_label()) except Exception as e: print('write failed to disease item:', disease) print(e)
def main():
    print(sys.argv[1], sys.argv[2])
    # pwd = input('Password:')
    # NOTE: the original login line was masked out; reconstructed here on the assumption that the
    # user name and password are passed on the command line, as the print above suggests.
    login_obj = PBB_login.WDLogin(user=sys.argv[1], pwd=sys.argv[2])

    prefix = '''
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX schema: <http://schema.org/>
    '''

    missing_go_query = '''
        SELECT distinct ?protein ?label WHERE {
            ?protein wdt:P279 wd:Q8054 .
            ?protein wdt:P703 wd:Q5 .
            OPTIONAL {
                ?protein rdfs:label ?label filter (lang(?label) = "en") .
                #?article schema:about ?protein .
            }
            FILTER NOT EXISTS {?protein wdt:P351 ?m} .
            FILTER NOT EXISTS {?protein wdt:P352 ?n} .
            FILTER NOT EXISTS {?protein wdt:P31 wd:Q21996465} .
            FILTER NOT EXISTS {?protein wdt:P31 wd:Q14633939} .
        }
        #GROUP BY ?protein
    '''

    results = PBB_Core.WDItemEngine.execute_sparql_query(prefix=prefix, query=missing_go_query)['results']['bindings']

    start_time = time.time()

    for count, x in enumerate(results):
        protein_qid = x['protein']['value'].split('/')[-1]
        # pprint.pprint(x)
        if 'label' in x:
            label = x['label']['value']
        else:
            print('No label found for', protein_qid)

        print_item(protein_qid)
        gene_qid = lookup_symbol(symbol=label)
        print('count:', count, 'Gene QID:', gene_qid)

        if gene_qid is not None:
            decision = input('Merge? (y):')
            if decision == 'y':
                merge(merge_from=protein_qid, merge_to=gene_qid, login_obj=login_obj)
        else:
            # Protein class/family Q417841
            # protein complex Q14633939
            decision = input('Protein class? (p):\nProtein complex? (c)\nSearch (s):')

            if decision == 's':
                s_qids, s_labels, s_descr, s_aliases = get_wd_search_results(search_string=label)
                for s_count, s in enumerate(s_qids):
                    print(s_count, s_qids[s_count], s_labels[s_count], s_descr[s_count], s_aliases[s_count])

                decision = input('Select by number:')
                try:
                    number = int(decision)
                    merge_to_qid = s_qids[number]
                    merge(merge_to=merge_to_qid, merge_from=protein_qid, login_obj=login_obj)
                    continue
                except ValueError:
                    decision = input('\n\nProtein class? (p):\nProtein complex? (c):')

            try:
                if decision == 'p':
                    data = [PBB_Core.WDItemID(value='Q417841', prop_nr='P31')]
                elif decision == 'c':
                    data = [PBB_Core.WDItemID(value='Q14633939', prop_nr='P31')]
                else:
                    continue

                wd_item = PBB_Core.WDItemEngine(wd_item_id=protein_qid, data=data)
                wd_item.write(login=login_obj)
                print('added protein class')
            except Exception as e:
                pprint.pprint(e)
                continue
        pass
def main():
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    PDBImageFix(login)
def main(): cid_wd_map = get_id_wd_map('P662') uniprot_wd_map = get_id_wd_map('P352') # pprint.pprint(cid_wd_map) interaction_types = { 'Agonist': 'Q389934', 'Inhibitor': 'Q427492', 'Allosteric modulator': 'Q2649417', 'Antagonist': 'Q410943', 'Channel blocker': 'Q5072487' } all_ligands = pd.read_csv('./iuphar_data/ligands.csv', header=0, sep=',', dtype={ 'PubChem CID': np.str, 'PubChem SID': np.str, 'Ligand id': np.str }, low_memory=False) all_interactions = pd.read_csv('./iuphar_data/interactions.csv', header=0, sep=',', dtype={ 'ligand_id': np.str, 'ligand_pubchem_sid': np.str }, low_memory=False) print(sys.argv[1]) login = PBB_login.WDLogin(user='******', pwd=sys.argv[1]) for count, uniprot_id in enumerate( all_interactions['target_uniprot'].unique()): if uniprot_id in uniprot_wd_map: uniprot_id_df = all_interactions[all_interactions['target_uniprot'] == uniprot_id] statements = list() for sid in uniprot_id_df['ligand_pubchem_sid']: try: cid = all_ligands.loc[all_ligands['PubChem SID'] == sid, 'PubChem CID'].iloc[0] iuphar_ligand = all_ligands.loc[ all_ligands['PubChem SID'] == sid, 'Ligand id'].iloc[0] itype = uniprot_id_df.loc[ uniprot_id_df['ligand_pubchem_sid'] == sid, 'type'].iloc[0] qualifier = [] if itype in interaction_types: qualifier.append( PBB_Core.WDItemID(value=interaction_types[itype], prop_nr='P366', is_qualifier=True)) if cid in cid_wd_map: # print(cid, 'will be added to', uniprot_id) compound_qid = cid_wd_map[cid] statements.append( PBB_Core.WDItemID( value=compound_qid, prop_nr='P129', references=generate_refs(iuphar_ligand), qualifiers=qualifier)) except IndexError as e: print('No CID found for:', sid, uniprot_id) continue if len(statements) == 0: continue try: print(len(statements)) item = PBB_Core.WDItemEngine( wd_item_id=uniprot_wd_map[uniprot_id], data=statements) item_qid = item.write(login) # pprint.pprint(item.get_wd_json_representation()) print('sucessfully written to', item_qid, item.get_label()) except Exception as e: print(e) for count, sid in enumerate( all_interactions['ligand_pubchem_sid'].unique()): try: cid = all_ligands.loc[all_ligands['PubChem SID'] == sid, 'PubChem CID'].iloc[0] except IndexError: continue if cid in cid_wd_map: sid_df = all_interactions[all_interactions['ligand_pubchem_sid'] == sid] statements = list() for uniprot in sid_df['target_uniprot']: try: # cid = all_ligands.loc[all_ligands['PubChem SID'] == sid, 'PubChem CID'].iloc[0] iuphar_ligand = all_ligands.loc[ all_ligands['PubChem SID'] == sid, 'Ligand id'].iloc[0] itype = sid_df.loc[sid_df['ligand_pubchem_sid'] == sid, 'type'].iloc[0] qualifier = [] if itype in interaction_types: qualifier.append( PBB_Core.WDItemID(value=interaction_types[itype], prop_nr='P794', is_qualifier=True)) if uniprot in uniprot_wd_map: # print(cid, 'will be added to', uniprot_id) uniprot_qid = uniprot_wd_map[uniprot] statements.append( PBB_Core.WDItemID( value=uniprot_qid, prop_nr='P129', references=generate_refs(iuphar_ligand), qualifiers=qualifier)) except IndexError as e: print('No Uniprot found for:', uniprot) continue if len(statements) == 0: continue try: print(len(statements)) item = PBB_Core.WDItemEngine(wd_item_id=cid_wd_map[cid], data=statements) item_qid = item.write(login) # pprint.pprint(item.get_wd_json_representation()) print('sucessfully written to', item_qid, item.get_label()) except Exception as e: print(e)
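# --- Hypothetical sketch of the generate_refs() helper used above (its real implementation is
# not shown in this snippet). It follows the reference pattern visible in the Gemma fragment
# further below: a 'stated in' snak, the source URL and a retrieval date, grouped into one
# reference block. The Guide to Pharmacology ligand URL pattern and the stated_in_qid
# placeholder are assumptions, not values taken from the original code.
from time import gmtime, strftime

def generate_refs(iuphar_ligand_id, stated_in_qid='Q0'):  # replace 'Q0' with the Guide to Pharmacology item
    ref_stated_in = PBB_Core.WDItemID(value=stated_in_qid, prop_nr='P248', is_reference=True)
    ref_url = PBB_Core.WDUrl(value='http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId='
                                   + str(iuphar_ligand_id), prop_nr='P854', is_reference=True)
    ref_retrieved = PBB_Core.WDTime(strftime('+%Y-%m-%dT00:00:00Z', gmtime()), prop_nr='P813', is_reference=True)
    return [[ref_stated_in, ref_url, ref_retrieved]]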
def main(): current_taxon_id = '' current_taxon_qid = '' def read_file(url): if not os.path.exists('./data'): os.makedirs('./data') file_name = url.split('/')[-1] file_path = './data/{}'.format(file_name) if not read_local or not os.path.isfile(file_path): requests_ftp.monkeypatch_session() s = requests.Session() if url.startswith('ftp://'): reply = s.retr(url, stream=True) else: reply = s.get(url, stream=True) with open(file_path, 'wb') as f: for chunk in reply.iter_content(chunk_size=2048): if chunk: f.write(chunk) f.flush() if file_name.endswith('.gz'): f = gzip.open(file_path, 'rt') else: f = open(file_path, 'rt') cnt = 0 while f: line = f.readline() if not line: break cnt += 1 if cnt % 100000 == 0: print('count: ', cnt) yield line def get_uniprot_for_entrez(): # query WD for all entrez IDs (eukaryotic) # query Uniprot for all high quality annotated Uniprots based on the entrez id. query = ''' SELECT * WHERE {{ ?gene wdt:P351 ?entrez . ?gene wdt:P703 wd:{} . OPTIONAL {{ {{?gene wdt:P354 ?hgnc_id .}} UNION {{?gene wdt:P671 ?mgi_id .}} }} }} '''.format(current_taxon_qid) results = PBB_Core.WDItemEngine.execute_sparql_query(query=query)['results']['bindings'] entrez_to_qid = dict() global res_id_to_entrez_qid res_id_to_entrez_qid = dict() for z in results: # ensure that the correct prefix exists so the identifier can be found in the Uniprot XML file entrez_qid = z['gene']['value'].split('/')[-1] entrez_id = z['entrez']['value'] res_id = '' if 'hgnc_id' in z: res_id = z['hgnc_id']['value'] if len(res_id.split(':')) <= 1: res_id = 'HGNC:' + res_id elif 'mgi_id' in z: res_id = z['mgi_id']['value'] if len(res_id.split(':')) <= 1: res_id = 'MGI:' + res_id entrez_to_qid[entrez_id] = (entrez_qid, res_id) res_id_to_entrez_qid.update({res_id: (entrez_qid, entrez_id)}) print('Wikidata Entrez query complete') uniprot_to_qid = get_all_wd_uniprots() print('Wikidata Uniprot query complete') up_prefix = ''' PREFIX taxon:<http://purl.uniprot.org/taxonomy/> PREFIX up:<http://purl.uniprot.org/core/> PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> ''' headers = { 'content-type': 'application/sparql-results+json', 'charset': 'utf-8' } up_query = ''' SELECT DISTINCT * WHERE {{ ?uniprot rdfs:seeAlso ?gene . ?uniprot up:reviewed ?reviewed . ?uniprot up:organism taxon:{} . 
FILTER regex(?gene, "^http://purl.uniprot.org/geneid/") }} GROUP BY ?uniprot ?gene ?reviewed '''.format(current_taxon_id) query_string = up_prefix + up_query data = { 'format': 'srj', 'query': query_string } uniprot_entrez_map_filename = 'uniprot_entrez_map_{}.json'.format(current_taxon_id) if read_local and os.path.isfile(uniprot_entrez_map_filename): with open(uniprot_entrez_map_filename, 'r') as f: results = json.load(f) else: reply = requests.post(url='http://sparql.uniprot.org/sparql/', params=data, headers=headers) results = reply.json() with open(uniprot_entrez_map_filename, 'w') as of: json.dump(results, of) print('Uniprot query complete') uniprot_map = dict() for ids in results['results']['bindings']: entrez_id = ids['gene']['value'].split('/')[-1] uniprot_id = ids['uniprot']['value'].split('/')[-1] reviewed = False if ids['reviewed']['value'] == 'true': reviewed = True if reviewed or (not reviewed and uniprot_id in uniprot_to_qid): if entrez_id not in entrez_to_qid: print('Entrez ID {} not in Wikidata'.format(entrez_id)) continue if uniprot_id not in uniprot_to_qid: protein_qid = '' else: protein_qid = uniprot_to_qid[uniprot_id] uniprot_map[uniprot_id] = { 'entrez': { 'id': entrez_id, 'qid': entrez_to_qid[entrez_id][0], 'res_id': entrez_to_qid[entrez_id][1] }, 'qid': protein_qid } # Uniprot items in WD without a link to a gene should also be updated, therefore add them to uniprot_map, # keep entrez empty. for wd_protein_item in uniprot_to_qid: if wd_protein_item not in uniprot_map: uniprot_map[wd_protein_item] = { 'entrez': { 'id': '', 'qid': '', 'res_id': '' }, 'qid': uniprot_to_qid[wd_protein_item] } return uniprot_map def get_all_wd_uniprots(): query = ''' SELECT * WHERE {{ ?protein wdt:P352 ?uniprot . ?protein wdt:P703 wd:{} . }} '''.format(current_taxon_qid) results = PBB_Core.WDItemEngine.execute_sparql_query(query=query)['results']['bindings'] wd_up_map = dict() for z in results: up = z['uniprot']['value'] qid = z['protein']['value'].split('/')[-1] # Make sure to reliably detect duplicate Uniprot IDs in Wikidata. # For performance reasons, this is done here and not by using PBB_core. if up in wd_up_map: PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format( main_data_id='{}'.format(up), exception_type='Duplicate Uniprot ID error.', message='Duplicate Uniprot IDs in Wikidata. Cleanup required!!', wd_id=qid, duration=time.time() )) else: wd_up_map.update({up: qid}) return wd_up_map def get_go_map(): query = ''' SELECT * WHERE { ?qid wdt:P686 ?go . 
} ''' results = PBB_Core.WDItemEngine.execute_sparql_query(query=query)['results']['bindings'] go_to_qid = dict() for z in results: go_to_qid[z['go']['value']] = { 'qid': z['qid']['value'].split('/')[-1], 'go_class_prop': '' } return go_to_qid def get_pdb_to_uniprot(): file = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_pdb.csv.gz' pdb_uniprot_map = dict() for c, line in enumerate(read_file(file)): if c < 2: print(line) continue dt = line.strip('\n').split(',') if dt[0] not in pdb_uniprot_map: pdb_uniprot_map[dt[0]] = dt[1].split(';') return pdb_uniprot_map def const_go_map(): base_dict = { 'go_terms': list(), 'evidence': list(), 'pdb': set() } file = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/pdb_chain_go.csv.gz' pdb_go_map = dict() for c, line in enumerate(read_file(file)): if c < 2: print(line) c += 1 continue dt = line.strip('\n').split(',') uniprot = copy.copy(dt[2]) if uniprot not in base_map: continue if uniprot not in pdb_go_map: pdb_go_map[uniprot] = copy.deepcopy(base_dict) pdb_go_map[uniprot]['go_terms'].append(dt[-1]) pdb_go_map[uniprot]['evidence'].append(dt[-2]) pdb_go_map[uniprot]['pdb'].add(dt[0]) print('total number of PDBs', len(pdb_go_map)) pdb_to_uniprot = get_pdb_to_uniprot() for uniprot in pdb_to_uniprot: if uniprot in pdb_go_map: pdb_go_map[uniprot]['pdb'].update(pdb_to_uniprot[uniprot]) else: pdb_go_map[uniprot] = copy.deepcopy(base_dict) pdb_go_map[uniprot]['pdb'] = set(pdb_to_uniprot[uniprot]) entrez_to_uniprot = {base_map[z]['entrez']['id']: z for z in base_map if base_map[z]['entrez']['id']} # Download and process latest human and mouse GO term annotation files # files = [ # 'http://geneontology.org/gene-associations/gene_association.goa_human.gz', # 'http://geneontology.org/gene-associations/gene_association.mgi.gz' # ] # # for file in files: # for line in read_file(file): # if line.startswith('!'): # continue # # cols = line.split('\t') # uniprot = cols[1] # go_id = cols[4] # evidence = cols[6] # go_class = cols[8] # # if cols[0] == 'MGI': # try: # mgi = cols[1] # entrez = res_id_to_entrez_qid[mgi][1] # uniprot = entrez_to_uniprot[entrez] # except KeyError: # continue # # if uniprot not in pdb_go_map: # pdb_go_map[uniprot] = copy.deepcopy(base_dict) # # pdb_go_map[uniprot]['go_terms'].append(go_id) # pdb_go_map[uniprot]['evidence'].append(evidence) # # try: # go_prop_map[go_id]['go_class_prop'] = ProteinBot.get_go_class(go_id, go_class) # except KeyError: # print('GO term {} not yet in Wikidata'.format(go_id)) # continue return pdb_go_map parser = argparse.ArgumentParser(description='ProteinBot parameters') parser.add_argument('--run-locally', action='store_true', help='Locally stored data files and run progress ' 'will be used. Acts also as if continuing a run.') parser.add_argument('--user', action='store', help='Username on Wikidata', required=True) parser.add_argument('--pwd', action='store', help='Password on Wikidata', required=True) parser.add_argument('--taxon-ids', action='store', help='Taxonomy IDs for the species the proteins should be written. Enter separated by a colon!' 'e.g. 
9606,10090 for human and mouse') args = parser.parse_args() read_local = args.run_locally login = PBB_login.WDLogin(user=args.user, pwd=args.pwd) taxon_ids = [x.strip() for x in args.taxon_ids.split(',')] if len(taxon_ids) == 0: print('No taxon IDs given, falling back to human (9606) and mouse (10090)') taxon_ids = ['9606', '10090'] for ti in taxon_ids: current_taxon_id = ti current_taxon_qid = ProteinBot.taxon_map[ti] progress_file_name = 'uniprot_progress_taxon_{}.json'.format(ti) # generate a basic mapping of Uniprot to Entrez and Wikidata genes and proteins base_map = get_uniprot_for_entrez() # generate mappings of GO terms to their Wikidata QIDs go_prop_map = get_go_map() # generate a map of Uniprot IDs with the matches PDB IDs, GO term and GO evidence codes pdb_to_go = const_go_map() if read_local and os.path.isfile(progress_file_name): with open(progress_file_name, 'r') as infile: progress = json.load(infile) else: progress = dict() for count, x in enumerate(base_map): if x in progress: continue pprint.pprint(x) pprint.pprint(base_map[x]) ProteinBot(uniprot=x, base_map=base_map, pdb_to_go=pdb_to_go, go_prop_map=go_prop_map, login=login, progress=progress, fast_run=True) with open(progress_file_name, 'w') as outfile: json.dump(progress, outfile)
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                SELECT * WHERE {
                    ?diseases wdt:P699 "DOID:""" + doid + """\"
                }
                """)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            disease_wdid = results['results']['bindings'][0]['diseases']['value'].split("/")[4]

            if results['results']['bindings'][0]['diseases']['value']:
                login = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), os.environ['wikidataApi'])

                if not (values["Gene Symbol"] in gnsym_gemma_ids):
                    gemmaGeneIds = "http://sandbox.chibi.ubc.ca/Gemma/rest/phenotype/find-candidate-genes?phenotypeValueUris=" + values["Phenotype URIs"]
                    result = requests.get(gemmaGeneIds, stream=True).json()
                    for item in result:
                        gnsym_gemma_ids[item['officialSymbol']] = item['id']

                refURL = PBB_Core.WDUrl(
                    value='http://chibi.ubc.ca/Gemma/phenotypes.html?phenotypeUrlId=DOID_' + doid + '&geneId=' + str(gnsym_gemma_ids[values["Gene Symbol"]]),
                    prop_nr='P854', is_reference=True)
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                SELECT * WHERE {
                    ?diseases wdt:P699 "DOID:""" + doid + """\"
                }
                """)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            pprint.pprint(results)

            # The current Disease Ontology term exists in Wikidata
            if len(results['results']['bindings']) != 0:
                disease_wdid = results['results']['bindings'][0]['diseases']['value'].split("/")[4]

                if results['results']['bindings'][0]['diseases']['value']:
                    login = PBB_login.WDLogin(PBB_settings.getWikiDataUser(), PBB_settings.getWikiDataPassword())
                    # put back in when using Jenkins: os.environ['wikidataApi']

                    # Only hit the API endpoint if we do not already have the gene symbol to Gemma ID mapping
                    if not (values["Gene Symbol"] in gnsym_gemma_ids):
                        gemmaGeneIds = "http://sandbox.chibi.ubc.ca/Gemma/rest/phenotype/find-candidate-genes?phenotypeValueUris=" + doid_url
                        result = requests.get(gemmaGeneIds, stream=True).json()
                        for item in result:
                            gnsym_gemma_ids[item['officialSymbol']] = item['id']

                    # not doing for now, until duplicate detection exists (for using qual)
                    # writing diseases to genes
                    refURL = PBB_Core.WDUrl(
                        value='http://chibi.ubc.ca/Gemma/phenotypes.html?phenotypeUrlId=DOID_' + doid + '&geneId=' + str(gnsym_gemma_ids[values["Gene Symbol"]]),
                        prop_nr='P854', is_reference=True)
                    refURL2 = PBB_Core.WDUrl(value=values["Web Link"], prop_nr='P854', is_reference=True)
                    refImported = PBB_Core.WDItemID(value='Q22330995', prop_nr='P143', is_reference=True)
                    refImported.overwrite_references = True
                    refStated = PBB_Core.WDItemID(value='Q22978334', prop_nr='P248', is_reference=True)
                    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
                    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
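# --- Sketch (not part of the original fragment): attaching reference snaks like the ones above ---
# In PBB_Core a statement typically takes its provenance as a list of reference blocks, each block
# being a list of snaks such as refStated/refImported/refRetrieved/refURL created above.
# P2293 (genetic association) is used purely as an illustrative property here.
gemma_refs = [[refStated, refImported, refRetrieved, refURL, refURL2]]
example_statement = PBB_Core.WDItemID(value=disease_wdid, prop_nr='P2293', references=gemma_refs)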
def main():
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    GOCleaner(login)
def main():
    print(sys.argv[1])
    # pwd = input('Password:')
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    GeneWikiStubMerger(login)