def gene_item_statements():
    """
    construct list of referenced statements to pass to the PBB_Core item engine
    :return:
    """
    # create reference object for the WD gene item claims
    ncbi_gene_reference = wdo.reference_store(
        source='ncbi_gene', identifier=gene_record['_id'])

    # claims for datatype string
    WD_String_CLAIMS = {
        'P351': str(gene_record['_id']),
        'P2393': gene_record['locus_tag'],
    }

    WD_Genome_Annotation_Claims = {
        'P644': str(int(gene_record['genomic_pos']['start'])),
        'P645': str(int(gene_record['genomic_pos']['end'])),
    }

    # claims for datatype item
    WD_Item_CLAIMS = {
        'P703': spec_strain.iloc[0]['wd_qid'],
        'P279': 'Q7187',
    }

    # convert the integer strand representation to the corresponding WD item
    # (forward strand / reverse strand)
    if gene_record['genomic_pos']['strand'] == 1:
        WD_Item_CLAIMS['P2548'] = 'Q22809680'
    elif gene_record['genomic_pos']['strand'] == -1:
        WD_Item_CLAIMS['P2548'] = 'Q22809711'

    chromosome = gene_record['genomic_pos']['chr']
    rs_chrom = PBB_Core.WDString(value=chromosome, prop_nr='P2249',
                                 is_qualifier=True)

    statements = []

    # convert each valid entry of each datatype dict to a PBB_Core data value
    # object and append it to the statements list

    # WDItemID datatype
    for k, v in WD_Item_CLAIMS.items():
        statements.append(
            PBB_Core.WDItemID(value=v, prop_nr=k,
                              references=[ncbi_gene_reference]))

    # WDString datatype
    for k, v in WD_String_CLAIMS.items():
        statements.append(
            PBB_Core.WDString(value=v, prop_nr=k,
                              references=[ncbi_gene_reference]))

    for k, v in WD_Genome_Annotation_Claims.items():
        statements.append(
            PBB_Core.WDString(value=v, prop_nr=k,
                              references=[ncbi_gene_reference],
                              qualifiers=[rs_chrom]))

    return statements
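# Usage sketch (not from the original source): assuming the surrounding script
# defines the globals used above (wdo, gene_record, spec_strain) and holds a
# PBB_login.WDLogin instance called 'login', the statements could be written to
# a gene item roughly like this. The label pattern mirrors the operon script
# further below and is an assumption here.
gene_statements = gene_item_statements()
gene_item = PBB_Core.WDItemEngine(
    item_name='{} {}'.format(gene_record['name'], gene_record['locus_tag']),
    data=gene_statements,
    domain='genes')
gene_item.write(login)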
def make_reference(self, stated_in, source_element, source_element_name, source_element_prop,
                   date=time.strftime('+%Y-%m-%dT00:00:00Z'), date_property='P813'):
    ref = [[
        PBB_Core.WDItemID(value=stated_in, prop_nr='P248', is_reference=True),  # stated in
        PBB_Core.WDString(value=source_element, prop_nr=source_element_prop, is_reference=True),  # source element
        PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),  # language of work
        PBB_Core.WDMonolingualText(value=source_element_name, language='en',
                                   prop_nr='P1476', is_reference=True),
        PBB_Core.WDTime(time=date, prop_nr=date_property, is_reference=True)  # publication date
    ]]

    # this will overwrite all existing references of a WD claim value
    for x in ref[0]:
        x.overwrite_references = True

    return ref
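# Illustrative call (mirrors how the DrugBank bot attaches references further
# below); 'bot' stands for an instance of the class defining make_reference,
# and the ChEMBL identifier / drug name are placeholders, not real data:
# chembl_ref = bot.make_reference(stated_in='Q6120337',
#                                 source_element='CHEMBL25',
#                                 source_element_name='example drug',
#                                 source_element_prop='P592')
# some_statement.set_references(chembl_ref)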
def protein_item_statements():
    """
    construct list of referenced statements to pass to PBB_Core Item engine
    :return:
    """
    uniprot_ref = wdo.reference_store(source='uniprot', identifier=uniprot)

    WD_String_CLAIMS = {
        'P637': str(gene_record['refseq']['protein']),  # set RefSeq protein ID
        'P352': uniprot  # set UniProt ID
    }

    WD_Item_CLAIMS = {
        'P703': [spec_strain.iloc[0]['wd_qid']],  # get strain taxid qid from strain record
        'P279': ['Q8054'],  # subclass of protein
    }

    statements = []

    # generate GO term claims
    for gt in gene_record['GOTERMS']:
        goprop = go_props[gt[1]]
        # get the GeneOntology item by GO ID
        govalue = wdo.WDSparqlQueries(prop='P686', string=gt[0]).wd_prop2qid()
        evprop = 'P459'
        try:
            evvalue = go_evidence_codes[gt[2]]
            evstat = PBB_Core.WDItemID(value=evvalue, prop_nr=evprop, is_qualifier=True)
            statements.append(
                PBB_Core.WDItemID(value=govalue, prop_nr=goprop,
                                  references=[uniprot_ref], qualifiers=[evstat]))
        except Exception as e:
            statements.append(
                PBB_Core.WDItemID(value=govalue, prop_nr=goprop,
                                  references=[uniprot_ref]))

    # generate list of PBB_Core value objects for all valid claims
    for k, v in WD_Item_CLAIMS.items():
        if v:
            for i in v:
                statements.append(
                    PBB_Core.WDItemID(value=i, prop_nr=k, references=[uniprot_ref]))

    for k, v in WD_String_CLAIMS.items():
        if v:
            statements.append(
                PBB_Core.WDString(value=v, prop_nr=k, references=[uniprot_ref]))

    return statements
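# The two lookup tables consumed above (go_props, go_evidence_codes) are defined
# elsewhere in the bot; the sketch below shows their assumed shape only and is
# illustrative, not the original definitions. go_props presumably maps a GO
# aspect letter to the Wikidata GO-class property (cf. P680/P681/P682 used in
# the other protein bots); go_evidence_codes presumably maps a GO evidence code
# to the Wikidata item used as a P459 qualifier.
go_props = {'F': 'P680', 'C': 'P681', 'P': 'P682'}  # assumed keys
go_evidence_codes = {
    'IEA': 'Q<evidence-item>',  # placeholder; the real value is the WD item for that evidence code
}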
def reference_store(source='', identifier=''):
    """
    :param source: database source to be referenced (key name from source_items)
    :param identifier: identifier of the record in the source database (property chosen from prop_ids by source)
    :return: PBB_Core reference object for database source
    """
    source_items = {'uniprot': 'Q905695',
                    'ncbi_gene': 'Q20641742',
                    'ncbi_taxonomy': 'Q13711410',
                    'swiss_prot': 'Q2629752',
                    'trembl': 'Q22935315'}

    prop_ids = {'uniprot': 'P352',
                'ncbi_gene': 'P351',
                'ncbi_taxonomy': 'P685',
                'ncbi_locus_tag': 'P2393'}

    refs = [PBB_Core.WDItemID(value=source_items[source], prop_nr='P248', is_reference=True),
            PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),
            PBB_Core.WDString(value=identifier, prop_nr=prop_ids[source], is_reference=True),
            PBB_Core.WDTime(str(strftime("+%Y-%m-%dT00:00:00Z", gmtime())),
                            prop_nr='P813', is_reference=True)]

    for ref in refs:
        ref.overwrite_references = True

    return refs
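# Usage sketch matching the gene/protein routines above (the identifier is a
# placeholder Entrez gene ID; PBB_Core is assumed to be imported as in the
# rest of these snippets):
ncbi_gene_reference = reference_store(source='ncbi_gene', identifier='945738')
entrez_claim = PBB_Core.WDString(value='945738', prop_nr='P351',
                                 references=[ncbi_gene_reference])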
def __init__(self, object):
    self.logincreds = object["logincreds"]
    self.name = object["uberonLabel"]
    self.uberon = object["uberon"]
    self.uberon_id = self.uberon.replace("http://purl.obolibrary.org/obo/UBERON_", "")
    self.wikidata_id = object["wikidata_id"]
    self.start = object["start"]
    self.graph = object["graph"]

    subcls = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
    id = URIRef("http://www.geneontology.org/formats/oboInOwl#id")
    hasExactSyn = URIRef("http://www.geneontology.org/formats/oboInOwl#hasExactSynonym")

    print(self.uberon_id)
    print(self.name)

    refStatedIn = PBB_Core.WDItemID(21552738, prop_nr='P248', is_reference=True)
    refStatedIn.overwrite_references = True
    refImported = PBB_Core.WDItemID(value=7876491, prop_nr='P143', is_reference=True)
    refImported.overwrite_references = True
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
    refRetrieved.overwrite_references = True
    ub_reference = [refStatedIn, refImported, refRetrieved]

    if self.uberon_id in self.wikidata_id.keys():
        self.wdid = self.wikidata_id[self.uberon_id.replace("UBERON:", "")]
    else:
        self.wdid = None

    self.synonyms = []
    for synonym in self.graph.objects(URIRef(self.uberon), hasExactSyn):
        self.synonyms.append(str(synonym))

    prep = dict()
    prep["P279"] = [PBB_Core.WDItemID(value='Q4936952', prop_nr='P279',
                                      references=[copy.deepcopy(ub_reference)])]
    prep["P1554"] = [PBB_Core.WDString(value=self.uberon_id, prop_nr='P1554',
                                       references=[copy.deepcopy(ub_reference)])]
    print(self.uberon)
    prep["P1709"] = [PBB_Core.WDUrl(value=self.uberon, prop_nr='P1709',
                                    references=[copy.deepcopy(ub_reference)])]

    data2add = []
    for key in prep.keys():
        for statement in prep[key]:
            data2add.append(statement)
            print(statement.prop_nr, statement.value)

    if self.wdid is not None:
        wdPage = PBB_Core.WDItemEngine(self.wdid, item_name=self.name, data=data2add,
                                       server="www.wikidata.org",
                                       domain="anatomical_structure", append_value=['P279'])
    else:
        wdPage = PBB_Core.WDItemEngine(item_name=self.name, data=data2add,
                                       server="www.wikidata.org",
                                       domain="anatomical_structure", append_value=['P279'])

    if len(self.synonyms) > 0:
        wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)
        print(self.synonyms)
        for syn in self.synonyms:
            print(syn)

    wdPage.write(self.logincreds)
    print("======")
    sys.exit()
def create_reference(self):
    first_ref = PBB_Core.WDItemID(value='Q905695', prop_nr='P248', is_reference=True)
    first_ref.overwrite_references = True

    return [
        first_ref,
        PBB_Core.WDString(value=self.uniprot, prop_nr='P352', is_reference=True),
        PBB_Core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z', time.gmtime()),
                        prop_nr='P813', is_reference=True),
        PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),  # language of work
    ]
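# In the protein bot further below, every statement gets its own copy of this
# reference block, along the lines of (UniProt accession is a placeholder):
# self.statements.append(PBB_Core.WDString(value='P12345', prop_nr='P352',
#                                          references=[self.create_reference()]))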
def generate_refs(ref_source_id):
    ref_list = [[]]

    if ref_source_id.startswith('C'):
        ref_list[0].extend([
            PBB_Core.WDItemID(value='Q6120337', prop_nr='P248', is_reference=True),  # stated in
            PBB_Core.WDString(value=ref_source_id, prop_nr='P592', is_reference=True),  # source element
        ])
    elif ref_source_id.startswith('N'):
        ref_list[0].extend([
            PBB_Core.WDItemID(value='Q21008030', prop_nr='P248', is_reference=True),  # stated in
            PBB_Core.WDString(value=ref_source_id, prop_nr='P2115', is_reference=True),  # source element
        ])

    ref_list[0].extend([
        PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),  # language of work
        # PBB_Core.WDMonolingualText(value=source_element_name, language='en',
        #                            prop_nr='P1476', is_reference=True),
        PBB_Core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'),
                        prop_nr='P813', is_reference=True)  # publication date
    ])

    return ref_list
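# The identifier prefix picks the source: ChEMBL IDs ('CHEMBL...') are
# referenced via P592 / stated in Q6120337, NDF-RT NUIs ('N...') via P2115 /
# stated in Q21008030. Illustrative call with a placeholder identifier:
refs = generate_refs('CHEMBL25')
chembl_claim = PBB_Core.WDString(value='CHEMBL25', prop_nr='P592', references=refs)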
def protein_item_statements():
    """
    construct list of referenced statements to pass to PBB_Core Item engine
    :return:
    """
    uniprot_ref = wdo.reference_store(source='uniprot', identifier=uniprot)

    WD_String_CLAIMS = {
        'P637': str(gene_record['refseq']['protein']),
        # 'P2393': gene_record['locus_tag'],
        'P352': uniprot
        # 'P591': str(gene_record['EC number'])
    }

    WD_Item_CLAIMS = {
        'P703': [spec_strain.iloc[0]['wd_qid']],
        'P279': ['Q8054'],
        'P680': [],  # molecular function
        'P681': [],  # cellular component
        'P682': []   # biological process
    }

    for gt in gene_record['GOTERMS']:
        gtids = parse_go_terms(gt)
        WD_Item_CLAIMS[gtids[1]].append(gtids[0])

    statements = []

    # generate list of PBB_Core value objects for all valid claims
    for k, v in WD_Item_CLAIMS.items():
        if v:
            for i in v:
                statements.append(
                    PBB_Core.WDItemID(value=i, prop_nr=k, references=[uniprot_ref]))

    for k, v in WD_String_CLAIMS.items():
        if v:
            statements.append(
                PBB_Core.WDString(value=v, prop_nr=k, references=[uniprot_ref]))

    return statements
def generate_refs(iuphar_ligand):
    ref_list = [[]]

    ref_list[0].extend([
        PBB_Core.WDItemID(value='Q2793172', prop_nr='P248', is_reference=True),  # stated in
        PBB_Core.WDString(value=iuphar_ligand, prop_nr='P595', is_reference=True),  # source element
    ])

    ref_list[0].extend([
        PBB_Core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),  # language of work
        # PBB_Core.WDMonolingualText(value=source_element_name, language='en',
        #                            prop_nr='P1476', is_reference=True),
        PBB_Core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'),
                        prop_nr='P813', is_reference=True)  # publication date
    ])

    return ref_list
def __init__(self, object):
    """
    constructor
    :param wd_do_content: Wikidata item id
    :param do_id: Identifier of the disease in Disease Ontology
    :param label: Primary label of the disease in Disease Ontology
    :param synonyms: All synonyms for the disease captured in the Disease Ontology
    :param xrefs: a dictionary with all external references of the disease captured in the Disease Ontology
    """
    # Reference section
    doVersionURL = object[1]
    doClass = object[0]
    self.logincreds = object[3]
    self.wd_doMappings = object[2]
    self.start = object[4]
    self.wd_do_content = doClass
    PBB_Debug.prettyPrint(self.wd_do_content)

    self.do_id = self.getDoValue(self.wd_do_content, './/oboInOwl:id')[0].text
    print(self.do_id)
    self.name = self.getDoValue(self.wd_do_content, './/rdfs:label')[0].text
    print(self.name)

    classDescription = self.getDoValue(
        self.wd_do_content,
        './/oboInOwl:hasDefinition/oboInOwl:Definition/rdfs:label')
    if len(classDescription) > 0:
        self.description = classDescription[0].text

    if self.do_id in object[2].keys():
        self.wdid = "Q" + str(object[2][self.do_id])
    else:
        self.wdid = None

    if len(self.getDoValue(self.wd_do_content, './/owl:deprecated')) > 0 and self.getDoValue(
            self.wd_do_content, './/owl:deprecated')[0].text == "true":
        self.rank = "deprecated"
    else:
        self.rank = "normal"

    self.synonyms = []
    for synonym in self.getDoValue(self.wd_do_content, './/oboInOwl:hasExactSynonym'):
        self.synonyms.append(synonym.text)

    self.subclasses = []
    for subclass in self.getDoValue(self.wd_do_content, './/rdfs:subClassOf'):
        parts = subclass.get(
            '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource').split("DOID_")
        if len(parts) > 1:
            self.subclasses.append("DOID:" + parts[1])
    if "DOID:4" in self.subclasses:
        self.subclasses.remove("DOID:4")

    self.xrefs = dict()
    for xref in self.getDoValue(self.wd_do_content, './/oboInOwl:hasDbXref'):
        if not xref.text.split(":")[0] in self.xrefs.keys():
            self.xrefs[xref.text.split(":")[0]] = []
        self.xrefs[xref.text.split(":")[0]].append(xref.text.split(":")[1])

    refStatedIn = PBB_Core.WDUrl(value=doVersionURL, prop_nr='P1065', is_reference=True)
    refStatedIn.overwrite_references = True
    refImported = PBB_Core.WDItemID(value=5282129, prop_nr='P248', is_reference=True)
    refImported.overwrite_references = True
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True)
    refRetrieved.overwrite_references = True
    do_reference = [refImported, refRetrieved, refStatedIn]

    prep = dict()
    # subclass of disease
    prep["P279"] = [
        PBB_Core.WDItemID(value='Q12136', prop_nr='P279',
                          references=[copy.deepcopy(do_reference)], rank=self.rank)
    ]
    for subclass in self.subclasses:
        if subclass in self.wd_doMappings.keys():
            prep["P279"].append(
                PBB_Core.WDItemID(value=self.wd_doMappings[subclass], prop_nr='P279',
                                  references=[copy.deepcopy(do_reference)], rank=self.rank))

    if "Orphanet" in self.xrefs.keys():
        prep["P1550"] = []
        if isinstance(self.xrefs["Orphanet"], list):
            for id in self.xrefs["Orphanet"]:
                prep["P1550"].append(
                    PBB_Core.WDString(value=id, prop_nr='P1550',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P1550"] = [
                PBB_Core.WDString(value=self.xrefs["Orphanet"], prop_nr='P1550',
                                  references=[copy.deepcopy(do_reference)], rank=self.rank)
            ]

    # Disease Ontology ID
    prep["P699"] = [
        PBB_Core.WDString(value=self.do_id, prop_nr='P699',
                          references=[do_reference], rank=self.rank)
    ]

    if "url" in self.xrefs.keys():
        if isinstance(self.xrefs["url"], list):
            for i in self.xrefs["url"]:
                if "//en.wikipedia.org/wiki/" in i:
                    wikilink = i.replace("//en.wikipedia.org/wiki/", "").replace("_", "")
                else:
                    wikilink = None
        else:
            if "//en.wikipedia.org/wiki/" in self.xrefs["url"]:
                wikilink = self.xrefs["url"].replace("//en.wikipedia.org/wiki/", "").replace("_", "")
            else:
                wikilink = None
    else:
        wikilink = None

    if "ICD10CM" in self.xrefs.keys():
        prep["P494"] = []
        if isinstance(self.xrefs["ICD10CM"], list):
            for id in self.xrefs["ICD10CM"]:
                prep["P494"].append(
                    PBB_Core.WDString(value=id, prop_nr='P494',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P494"] = [
                PBB_Core.WDString(value=self.xrefs["ICD10CM"], prop_nr='P494',
                                  references=[copy.deepcopy(do_reference)], rank=self.rank)
            ]

    if "ICD9CM" in self.xrefs.keys():
        prep["P493"] = []
        if isinstance(self.xrefs["ICD9CM"], list):
            for id in self.xrefs["ICD9CM"]:
                prep["P493"].append(
                    PBB_Core.WDString(value=id, prop_nr='P493',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P493"] = [
                PBB_Core.WDString(value=self.xrefs["ICD9CM"], prop_nr='P493',
                                  references=[copy.deepcopy(do_reference)], rank=self.rank)
            ]

    if "MSH" in self.xrefs.keys():
        prep["P486"] = []
        if isinstance(self.xrefs["MSH"], list):
            for id in self.xrefs["MSH"]:
                prep["P486"].append(
                    PBB_Core.WDString(value=id, prop_nr='P486',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P486"] = [
                PBB_Core.WDString(value=self.xrefs["MSH"], prop_nr='P486',
                                  references=[copy.deepcopy(do_reference)], rank=self.rank)
            ]

    if "NCI" in self.xrefs.keys():
        prep["P1748"] = []
        if isinstance(self.xrefs["NCI"], list):
            for id in self.xrefs["NCI"]:
                prep["P1748"].append(
                    PBB_Core.WDString(value=id, prop_nr='P1748',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P1748"] = [
                PBB_Core.WDString(value=self.xrefs["NCI"], prop_nr='P1748',
                                  references=[copy.deepcopy(do_reference)], rank=self.rank)
            ]

    if "OMIM" in self.xrefs.keys():
        prep["P492"] = []
        if isinstance(self.xrefs["OMIM"], list):
            for id in self.xrefs["OMIM"]:
                prep["P492"].append(
                    PBB_Core.WDString(value=id, prop_nr='P492',
                                      references=[copy.deepcopy(do_reference)],
                                      rank=self.rank))
        else:
            prep["P492"] = [
                PBB_Core.WDString(value=self.xrefs["OMIM"], prop_nr='P492',
                                  references=[copy.deepcopy(do_reference)], rank=self.rank)
            ]

    print(self.wdid)

    data2add = []
    for key in prep.keys():
        for statement in prep[key]:
            data2add.append(statement)
            print(statement.prop_nr, statement.value)

    if self.wdid is not None:
        wdPage = PBB_Core.WDItemEngine(self.wdid, item_name=self.name, data=data2add,
                                       server="www.wikidata.org", domain="diseases",
                                       append_value=['P279'])
    else:
        wdPage = PBB_Core.WDItemEngine(item_name=self.name, data=data2add,
                                       server="www.wikidata.org", domain="diseases",
                                       append_value=['P279'])

    # wdPage.set_description(description='Human disease', lang='en')
    if wikilink is not None:
        wdPage.set_sitelink(site="enwiki", title=wikilink)
    if self.synonyms is not None:
        wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)

    self.wd_json_representation = wdPage.get_wd_json_representation()
    PBB_Debug.prettyPrint(self.wd_json_representation)
    wdPage.write(self.logincreds)

    if not os.path.exists('./json_dumps'):
        os.makedirs('./json_dumps')
    f = open('./json_dumps/' + self.do_id.replace(":", "_") + '.json', 'w+')
    pprint.pprint(self.wd_json_representation, stream=f)
    f.close()

    PBB_Core.WDItemEngine.log(
        'INFO',
        '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
        .format(main_data_id=self.do_id, exception_type='', message=f.name,
                wd_id=self.wdid, duration=time.time() - self.start))
        strand = 'Q22809711'
        genes = line[2:]
        for gene in genes:
            for lgene in list_genes:
                if gene == lgene['symbol']:
                    lgene['operon'] = {'operon': operon, 'strand': strand}
    f.close()
    return list_genes


ops = combine_resources()
# pprint.pprint(ops)
genestot = len(ops)
count = 0

reference = [
    PBB_Core.WDString(value='19448609', prop_nr='P698', is_reference=True),
    PBB_Core.WDTime(str(strftime("+%Y-%m-%dT00:00:00Z", gmtime())),
                    prop_nr='P813', is_reference=True)
]
for ref in reference:
    ref.overwrite_references = True

login = PBB_login.WDLogin(sys.argv[1], sys.argv[2])

for gene in ops:
    statements = []
    if 'locus_tag' in gene.keys():
        item_name = '{} {}'.format(gene['name'], gene['locus_tag'])
    if 'operon' in gene.keys():
        count += 1
def __init__(self, login): self.login_obj = login # wdq_results = PBB_Core.WDItemList('CLAIM[686]', '686').wditems # wd_go_terms = list(map(lambda z: z[2], wdq_results['props']['686'])) # go_qid_list = list(map(lambda z: 'Q{}'.format(z[0]), wdq_results['props']['686'])) query = ''' SELECT distinct ?gene ?go WHERE { ?gene wdt:P686 ?go . FILTER(!REGEX(?go, "^GO:[0-9]", "i")) } ''' qids_to_clean = set() for x in PBB_Core.WDItemEngine.execute_sparql_query( query=query)['results']['bindings']: qids_to_clean.add(x['gene']['value'].split('/')[-1]) # print(len(wd_go_terms)) # for count, go_term in enumerate(wd_go_terms): # curr_qid = go_qid_list[wd_go_terms.index(go_term)] # # # try: # # int(go_term) # # except ValueError as e: # qids_to_clean.add(curr_qid) for count, curr_qid in enumerate(qids_to_clean): start = time.time() clean_gos = [] print(curr_qid) cleanup_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid) for wd_value in cleanup_item.statements: if wd_value.get_prop_nr() == 'P686': go_value = wd_value.get_value() # int(go_value) if not go_value.startswith('GO'): clean_gos.append( PBB_Core.WDString(value='GO:' + go_value, prop_nr='P686')) try: go_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid, data=clean_gos) # pprint.pprint(go_item.get_wd_json_representation()) go_item.write(self.login_obj) PBB_Core.WDItemEngine.log( 'INFO', '"{exception_type}", "{message}", {wd_id}, {duration}'. format(exception_type='', message='success', wd_id=curr_qid, duration=time.time() - start)) print(count, 'success', curr_qid, go_item.get_label(lang='en')) except Exception as e: print(count, 'error', curr_qid) PBB_Core.WDItemEngine.log( 'ERROR', '"{exception_type}", "{message}", {wd_id}, {duration}'. format(exception_type=type(e), message=e.__str__(), wd_id=curr_qid, duration=time.time() - start))
def __init__(self, object): # Populate variables with different values self.geneSymbols = object["geneSymbols"] self.logincreds = object["logincreds"] self.goTerms = object["goTerms"] self.version = object["results"]["bindings"][0]["upversion"]["value"] self.uniprot = object["results"]["bindings"][0]["uniprot"]["value"] self.uniprotId = object["id"] self.name = object["results"]["bindings"][0]["plabel"]["value"] self.start = object["start"] self.entrezWikidataIds = object["entrezWikidataIds"] up_in_wd = search_wd(self.name) self.wdid = None hits = [] for result in up_in_wd["search"]: if result["match"]["text"] == up_in_wd["searchinfo"]["search"]: hits.append(result) print(result["match"]["text"]) if len(hits) > 0: valid = [] for hit in hits: hitPage = PBB_Core.WDItemEngine(item_name=hit["label"], wd_item_id=hit["id"], data=[], server="www.wikidata.org", domain="proteins") json_rep = hitPage.get_wd_json_representation() proteinClaim = False geneClaim = False speciesClaim = False if "P279" in json_rep["claims"].keys(): for it in json_rep["claims"]["P279"]: if it["mainsnak"]["datavalue"]["value"][ "numeric-id"] == 8054: proteinClaim = True break if it["mainsnak"]["datavalue"]["value"][ "numeric-id"] == 7187: geneClaim = True break if it["mainsnak"]["datavalue"]["value"][ "numeric-id"] == 407355: proteinClaim = True break if "P31" in json_rep["claims"].keys(): for it in json_rep["claims"]["P31"]: if it["mainsnak"]["datavalue"]["value"][ "numeric-id"] == 8047: proteinClaim = True break if it["mainsnak"]["datavalue"]["value"][ "numeric-id"] == 8054: proteinClaim = True break if "P703" in json_rep["claims"].keys(): for it in json_rep["claims"]["P703"]: if it["mainsnak"]["datavalue"]["value"][ "numeric-id"] == 5: speciesClaim = True break if len(json_rep["claims"]) == 0: raise Exception(hit["id"] + " has an indentical label as " + self.uniprotId + ", but with no claims") elif ("P352" in json_rep["claims"].keys() or "P705" in json_rep["claims"].keys() or proteinClaim): valid.append(hit["id"]) elif geneClaim: self.wdid = None else: raise Exception(hit["id"] + " has an identical label as " + self.uniprotId + " but with no valid protein claims") if len(valid) == 1: self.wdid = valid[0] elif len(valid) > 1: raise Exception( self.uniprotId + " There are multiple valid Wikidata items that might be applicable. 
" + str(valid)) if "gene_id" in object["results"]["bindings"][0].keys(): self.gene_id = [] for geneId in object["results"]["bindings"][0]["gene_id"][ "value"].split(";"): if geneId != "": self.gene_id.append(geneId) if "ecName" in object["results"]["bindings"][0].keys(): self.ecname = [] self.ecname.append( object["results"]["bindings"][0]["ecName"]["value"]) self.alias = [] for syn in object["results"]["bindings"][0]["upalias"]["value"].split( ";"): if syn != "": self.alias.append(syn) if "pdbid" in object["results"]["bindings"][0].keys( ) and object["results"]["bindings"][0]["pdbid"]["value"] != "": self.pdb = [] for pdbId in object["results"]["bindings"][0]["pdbid"][ "value"].split(";"): self.pdb.append( pdbId.replace("http://rdf.wwpdb.org/pdb/", "").replace(" ", "")) if "refseqid" in object["results"]["bindings"][0].keys(): self.refseq = [] for refseqId in object["results"]["bindings"][0]["refseqid"][ "value"].split(";"): self.refseq.append( refseqId.replace("http://purl.uniprot.org/refseq/", "").replace(" ", "")) if "ensemblp" in object["results"]["bindings"][0].keys( ) and object["results"]["bindings"][0]["ensemblp"]["value"] != "": self.ensemblp = [] for ensP in object["results"]["bindings"][0]["ensemblp"][ "value"].split(";"): self.ensemblp.append( ensP.replace("http://purl.uniprot.org/ensembl/", "").replace(" ", "")) # Prepare references refStatedIn = PBB_Core.WDItemID(value=2629752, prop_nr='P248', is_reference=True) refStatedIn.overwrite_references = True refURL = "http://www.uniprot.org/uniprot/" + self.uniprotId + ".txt?version=" + str( self.version) refReferenceURL = PBB_Core.WDUrl(value=refURL, prop_nr='P854', is_reference=True) refReferenceURL.overwrite_references = True refImported = PBB_Core.WDItemID(value=905695, prop_nr='P143', is_reference=True) refImported.overwrite_references = True timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime()) refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813', is_reference=True) refRetrieved.overwrite_references = True protein_reference = [[ refStatedIn, refImported, refRetrieved, refReferenceURL ]] references = dict() proteinPrep = dict() genePrep = dict() # P279 = subclass of proteinPrep['P279'] = [ PBB_Core.WDItemID(value="Q8054", prop_nr='P279', references=protein_reference) ] # P703 = found in taxon proteinPrep['P703'] = [ PBB_Core.WDItemID(value="Q5", prop_nr='P703', references=protein_reference) ] # P352 = UniprotID proteinPrep['P352'] = [ PBB_Core.WDString(value=self.uniprotId, prop_nr='P352', references=protein_reference) ] # P591 = ec number if "ecname" in vars(self): proteinPrep['P591'] = [] for i in range(len(self.ecname)): proteinPrep['P591'].append( PBB_Core.WDString(value=self.ecname[i], prop_nr='P591', references=protein_reference)) # P638 = PDBID if "pdb" in vars(self) and len(self.pdb) > 0: proteinPrep['P638'] = [] for i in range(len(self.pdb)): proteinPrep['P638'].append( PBB_Core.WDString(value=self.pdb[i], prop_nr='P638', references=protein_reference)) # P637 = Refseq Protein ID if "refseq" in vars(self) and len(self.refseq) > 0: proteinPrep['P637'] = [] for i in range(len(self.refseq)): proteinPrep['P637'].append( PBB_Core.WDString(value=self.refseq[i], prop_nr='P637', references=protein_reference)) # P705 = Ensembl Protein ID if "ensemblp" in vars(self) and len(self.ensemblp) > 0: proteinPrep['P705'] = [] for i in range(len(self.ensemblp)): proteinPrep['P705'].append( PBB_Core.WDString(value=self.ensemblp[i], prop_nr='P705', references=protein_reference)) """ # P686 = Gene Ontology ID proteinPrep["P680"] 
= [] proteinPrep["P681"] = [] proteinPrep["P682"] = [] for result in self.goTerms["results"]["bindings"]: statement = [ PBB_Core.WDString(value=result["go"]["value"].replace("http://purl.obolibrary.org/obo/GO_", "GO:"), prop_nr='P686', references=protein_reference)] goWdPage = PBB_Core.WDItemEngine(item_name=result["goLabel"]["value"], data=statement, server="www.wikidata.org", domain="proteins") if goWdPage.get_description() == "": goWdPage.set_description("Gene Ontology term") js = goWdPage.get_wd_json_representation() goWdId = goWdPage.write(self.logincreds) if result["parentLabel"]["value"] == "molecular_function": exists = False for i in range(len(proteinPrep["P680"])): if proteinPrep["P680"][i].value == goWdId: exists = True if not exists: proteinPrep["P680"].append( PBB_Core.WDItemID(value=goWdId, prop_nr='P680', references=protein_reference)) if result["parentLabel"]["value"] == "cellular_component": exists = False for i in range(len(proteinPrep["P681"])): if proteinPrep["P681"][i].value == goWdId: exists = True if not exists: proteinPrep["P681"].append( PBB_Core.WDItemID(value=goWdId, prop_nr='P681', references=protein_reference)) if result["parentLabel"]["value"] == "biological_process": exists = False for i in range(len(proteinPrep["P682"])): if proteinPrep["P682"][i].value == goWdId: exists = True if not exists: proteinPrep["P682"].append( PBB_Core.WDItemID(value=goWdId, prop_nr='P682', references=protein_reference)) """ # P702 = Encoded by if "gene_id" in vars(self) and len(self.gene_id) > 0: proteinPrep['P702'] = [] proteinPrep['P702'].append( PBB_Core.WDItemID( value=self.entrezWikidataIds[self.gene_id[0].replace( "http://purl.uniprot.org/geneid/", "").replace(" ", "")], prop_nr='P702', references=protein_reference)) proteinData2Add = [] for key in proteinPrep.keys(): for statement in proteinPrep[key]: proteinData2Add.append(statement) print(statement.prop_nr, statement.value) if self.wdid is None: wdProteinpage = PBB_Core.WDItemEngine(item_name=self.name, data=proteinData2Add, server="www.wikidata.org", domain="proteins", append_value=['P279']) else: wdProteinpage = PBB_Core.WDItemEngine(wd_item_id=self.wdid, item_name=self.name, data=proteinData2Add, server="www.wikidata.org", domain="proteins", append_value=['P279']) if len(self.alias) > 0: wdProteinpage.set_aliases(aliases=self.alias, lang='en', append=True) if wdProteinpage.get_description() == "": wdProteinpage.set_description(description='human protein', lang='en') if wdProteinpage.get_description(lang="de") == "": wdProteinpage.set_description(description='humanes Protein', lang='de') if wdProteinpage.get_description(lang="nl") == "": wdProteinpage.set_description(description='menselijk eiwit', lang='nl') if wdProteinpage.get_description( lang="fr") == "" or wdProteinpage.get_description( lang="fr") == "protéine": wdProteinpage.set_description(description='protéine humaine', lang='fr') self.wd_json_representation = wdProteinpage.get_wd_json_representation( ) PBB_Debug.prettyPrint(self.wd_json_representation) wdProteinpage.write(self.logincreds) print(wdProteinpage.wd_item_id) if not os.path.exists('./json_dumps'): os.makedirs('./json_dumps') f = open('./json_dumps/' + self.uniprotId + '.json', 'w+') pprint.pprint(self.wd_json_representation, stream=f) f.close() PBB_Core.WDItemEngine.log( 'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id=self.uniprotId, exception_type='', message=f.name, wd_id=self.wdid, duration=time.time() - self.start)) print("===============")
def __init__(self, login, prop_nr, prefix_str, separator=':'):
    """
    A class to take care of fixing certain identifier prefixes
    :param login: The Wikidata login object instance of PBB_login.WDLogin()
    :param prop_nr: the property number of the identifier the prefix should be fixed for
    :param prefix_str: the prefix string, e.g. 'GO', 'DOID'
    :param separator: the separator character between prefix and string
    """
    self.login_obj = login

    query = '''
        SELECT distinct ?s ?id WHERE {{
            ?s wdt:{0} ?id .
            FILTER(!REGEX(?id, "^{1}{2}[0-9]", "i"))
        }}
    '''.format(prop_nr, prefix_str, separator)

    qids_to_clean = set()
    for x in PBB_Core.WDItemEngine.execute_sparql_query(query=query)['results']['bindings']:
        qids_to_clean.add(x['s']['value'].split('/')[-1])

    print('Cleaning up', len(qids_to_clean), 'items.')

    for count, curr_qid in enumerate(qids_to_clean):
        start = time.time()
        clean_gos = []
        print(curr_qid)

        cleanup_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid)
        for wd_value in cleanup_item.statements:
            if wd_value.get_prop_nr() == prop_nr:
                go_value = wd_value.get_value()
                if not go_value.startswith(prefix_str):
                    clean_gos.append(
                        PBB_Core.WDString(value=prefix_str + separator + go_value,
                                          prop_nr=prop_nr))

        try:
            go_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid, data=clean_gos)
            go_item.write(self.login_obj)
            PBB_Core.WDItemEngine.log(
                'INFO',
                '"{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    exception_type='', message='success', wd_id=curr_qid,
                    duration=time.time() - start))
            print(count, 'success', curr_qid, go_item.get_label(lang='en'))
        except Exception as e:
            print(count, 'error', curr_qid)
            PBB_Core.WDItemEngine.log(
                'ERROR',
                '"{exception_type}", "{message}", {wd_id}, {duration}'.format(
                    exception_type=type(e), message=e.__str__(), wd_id=curr_qid,
                    duration=time.time() - start))
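# Illustrative instantiation (the enclosing class name is not shown in this
# snippet, so 'IDPrefixCleanup' is a placeholder); this would prepend the
# 'GO:' prefix to any P686 value that lacks it, generalizing the hard-coded
# GO cleanup bot above:
# login = PBB_login.WDLogin(sys.argv[1], sys.argv[2])
# IDPrefixCleanup(login=login, prop_nr='P686', prefix_str='GO', separator=':')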
def cleanup_obsolete_edges(ontology_id, core_property_nr, login, current_node_qids=(), obsolete_term=False): filter_props_string = '' if not obsolete_term: for x in OBOImporter.obo_wd_map.values(): prop_nr = list(x.keys())[0] filter_props_string += 'Filter (?p = wdt:{})\n'.format(prop_nr) query = ''' SELECT DISTINCT ?qid ?p ?onto_qid WHERE {{ {{ SELECT DISTINCT ?onto_qid WHERE {{ ?onto_qid wdt:{2} '{0}' . }} }} ?qid ?p [wdt:{2} '{0}']. {1} }} ORDER BY ?qid '''.format(ontology_id, filter_props_string, core_property_nr) print(query) sr = PBB_Core.WDItemEngine.execute_sparql_query(query=query) for occurrence in sr['results']['bindings']: if 'statement' in occurrence['qid']['value']: continue start = time.time() qid = occurrence['qid']['value'].split('/')[-1] if qid in current_node_qids: continue prop_nr = occurrence['p']['value'].split('/')[-1] wd_onto_qid = occurrence['onto_qid']['value'].split('/')[-1] wd_item_id = PBB_Core.WDItemID(value=wd_onto_qid, prop_nr=prop_nr) setattr(wd_item_id, 'remove', '') try: wd_item = PBB_Core.WDItemEngine(wd_item_id=qid, data=[wd_item_id]) wd_item.write(login=login) PBB_Core.WDItemEngine.log( 'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id='{}'.format(ontology_id), exception_type='', message='successfully removed obsolete edges', wd_id=qid, duration=time.time() - start)) except Exception as e: print(e) PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id='{}'.format(ontology_id), exception_type=type(e), message=e.__str__(), wd_id=qid, duration=time.time() - start)) if obsolete_term: data = [ PBB_Core.WDString(value=ontology_id, prop_nr=core_property_nr, rank='deprecated'), ] start = time.time() try: wd_item = PBB_Core.WDItemEngine(item_name='obo', domain='obo', data=data, use_sparql=True) if wd_item.create_new_item: return qid = wd_item.write(login=login) PBB_Core.WDItemEngine.log( 'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id='{}'.format(ontology_id), exception_type='', message='successfully obsoleted the ', wd_id=qid, duration=time.time() - start)) except Exception as e: print(e) PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id='{}'.format(ontology_id), exception_type=type(e), message=e.__str__(), wd_id='', duration=time.time() - start))
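# Example invocation mirroring how the OBO importer's get_item_qid() below calls
# this helper when a term is flagged obsolete (the ontology ID and core property
# are illustrative; 'login_obj' stands for a PBB_login.WDLogin instance):
# OBOImporter.cleanup_obsolete_edges(ontology_id='GO:0000000',
#                                    core_property_nr='P686',
#                                    login=login_obj,
#                                    obsolete_term=True)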
def get_item_qid(go_id, data=()): start = time.time() # for efficiency reasons, skip if item already had a root write performed if go_id in self.local_qid_onto_map and self.local_qid_onto_map[go_id]['had_root_write'] \ and 'qid' in self.local_qid_onto_map[go_id]: return self.local_qid_onto_map[go_id]['qid'] try: data = list(data) r = requests.get(url=self.base_url + '{}_{}'.format(self.ontology, go_id), headers=self.headers) go_term_data = r.json() label = go_term_data['label'] description = go_term_data['description'][0] if go_term_data['is_obsolete']: OBOImporter.cleanup_obsolete_edges( ontology_id='{}:{}'.format(self.ontology, go_id), login=self.login_obj, core_property_nr=self.core_property_nr, obsolete_term=True) return None # get parent ontology term info so item can be populated with description, etc. data.append( PBB_Core.WDString(value='GO:{}'.format(go_id), prop_nr=self.core_property_nr, references=[self.create_reference()])) print(data) if go_id in self.local_qid_onto_map: wd_item = PBB_Core.WDItemEngine( wd_item_id=self.local_qid_onto_map[go_id]['qid'], domain='obo', data=data, use_sparql=True) else: wd_item = PBB_Core.WDItemEngine(item_name='test', domain='obo', data=data, use_sparql=True) wd_item.set_label(label=label) if len(description) <= 250: wd_item.set_description(description=description) else: wd_item.set_description(description='Gene Ontology term') if go_term_data['synonyms'] is not None and len( go_term_data['synonyms']) > 0: aliases = [] for alias in go_term_data['synonyms']: if len(alias) <= 250: aliases.append(alias) wd_item.set_aliases(aliases=aliases) new_msg = '' if wd_item.create_new_item: new_msg = ': created new GO term' qid = wd_item.write(login=self.login_obj) if go_id not in self.local_qid_onto_map: self.local_qid_onto_map[go_id] = { 'qid': qid, 'had_root_write': False, } if go_id == current_root_id: self.local_qid_onto_map[go_id]['had_root_write'] = True self.local_qid_onto_map[go_id]['parents'] = list(parents) self.local_qid_onto_map[go_id]['children'] = list(children) current_node_qids.append(qid) print('QID created or retrieved', qid) PBB_Core.WDItemEngine.log( 'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id='{}:{}'.format(self.ontology, go_id), exception_type='', message='success{}'.format(new_msg), wd_id=qid, duration=time.time() - start)) return qid except Exception as e: print(e) PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id='{}:{}'.format(self.ontology, go_id), exception_type=type(e), message=e.__str__(), wd_id='', duration=time.time() - start)) return None
def get_item_qid(go_id, data=()): start = time.time() if self.use_prefix: id_string = '{}:{}'.format(self.ontology, go_id) else: id_string = go_id # for efficiency reasons, skip if item already had a root write performed if go_id in self.local_qid_onto_map and self.local_qid_onto_map[go_id]['had_root_write'] \ and 'qid' in self.local_qid_onto_map[go_id]: return self.local_qid_onto_map[go_id]['qid'], False, False try: data = list(data) r = OBOImporter.ols_session.get( url=self.base_url + '{}_{}'.format(self.ontology, go_id), headers=self.headers) go_term_data = r.json() label = go_term_data['label'].replace('_', ' ') description = go_term_data['description'][0] if go_term_data['is_obsolete']: OBOImporter.cleanup_obsolete_edges( ontology_id=id_string, login=self.login_obj, core_property_nr=self.core_property_nr, obsolete_term=True) return None, None, None # get parent ontology term info so item can be populated with description, etc. data.append( PBB_Core.WDString(value=id_string, prop_nr=self.core_property_nr, references=[self.create_reference()])) exact_match_string = 'http://purl.obolibrary.org/obo/{}_{}'.format( self.ontology, go_id) data.append( PBB_Core.WDUrl(value=exact_match_string, prop_nr='P2888')) # add xrefs if go_term_data['obo_xref']: for xref in go_term_data['obo_xref']: if xref['database'] in OBOImporter.xref_props: wd_prop = OBOImporter.xref_props[xref['database']] else: continue xref_value = xref['id'] data.append( PBB_Core.WDExternalID( value=xref_value, prop_nr=wd_prop, references=[self.create_reference()])) if go_term_data['obo_synonym']: for syn in go_term_data['obo_synonym']: if syn['type'] in OBOImporter.obo_synonyms: wd_prop = OBOImporter.obo_synonyms[syn['type']] else: continue syn_value = syn['name'] data.append( PBB_Core.WDExternalID( value=syn_value, prop_nr=wd_prop, references=[self.create_reference()])) if go_id in self.local_qid_onto_map: wd_item = PBB_Core.WDItemEngine( wd_item_id=self.local_qid_onto_map[go_id]['qid'], domain='obo', data=data, fast_run=self.fast_run, fast_run_base_filter=self.fast_run_base_filter) else: wd_item = PBB_Core.WDItemEngine( item_name='test', domain='obo', data=data, fast_run=self.fast_run, fast_run_base_filter=self.fast_run_base_filter) wd_item.set_label(label=label) wd_item.set_description(description=description[0:250]) # if len(description) <= 250: # wd_item.set_description(description=description) # else: # wd_item.set_description(description='Gene Ontology term') if go_term_data['synonyms'] is not None and len( go_term_data['synonyms']) > 0: aliases = [] for alias in go_term_data['synonyms']: if len(alias) <= 250: aliases.append(alias) wd_item.set_aliases(aliases=aliases) new_msg = '' if wd_item.create_new_item: new_msg = ': created new {} term'.format(self.ontology) qid = wd_item.write(login=self.login_obj) if go_id not in self.local_qid_onto_map: self.local_qid_onto_map[go_id] = { 'qid': qid, 'had_root_write': False, } if go_id == current_root_id: self.local_qid_onto_map[go_id]['had_root_write'] = True self.local_qid_onto_map[go_id]['parents'] = list(parents) self.local_qid_onto_map[go_id]['children'] = list(children) current_node_qids.append(qid) print('QID created or retrieved', qid) PBB_Core.WDItemEngine.log( 'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id='{}:{}'.format(self.ontology, go_id), exception_type='', message='success{}'.format(new_msg), wd_id=qid, duration=time.time() - start)) return qid, go_term_data['obo_xref'], wd_item.require_write except Exception as 
e: print(e) # traceback.print_exc(e) PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id='{}:{}'.format(self.ontology, go_id), exception_type=type(e), message=e.__str__(), wd_id='', duration=time.time() - start)) return None, None, None
def __init__(self, uniprot, base_map, pdb_to_go, go_prop_map, login, progress, fast_run=True): self.uniprot = uniprot self.uniprot_qid = base_map[uniprot]['qid'] self.ensp = set() self.ncbip = set() self.go_terms = set() self.login = login self.go_prop_map = go_prop_map self.entrez = base_map[uniprot]['entrez']['id'] self.entrez_quid = base_map[uniprot]['entrez']['qid'] self.res_id = base_map[uniprot]['entrez']['res_id'] self.label = '' self.description = '' self.aliases = set() self.tax_id = '' self.annotation_type = '' self.statements = [] self.res_prefixes = {x.split(':')[0] for x in res_id_to_entrez_qid} start = time.time() if not os.path.exists('./data/uniprot_raw'): os.makedirs('./data/uniprot_raw') # check if Uniprot xml exists and its age? r = requests.get('http://www.uniprot.org/uniprot/{}.xml'.format(self.uniprot)) f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'w') f.write(r.text) f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'r') # check if XML can be properly parsed, log obsolete items for permanent removal. try: for event, e in Et.iterparse(f, events=('start', 'end')): if event == 'end' and e.tag == '{http://uniprot.org/uniprot}entry': if 'dataset' in e.attrib: self.annotation_type = e.attrib['dataset'] if event == 'end' and e.tag == '{http://uniprot.org/uniprot}protein': tmp = e.find('./{http://uniprot.org/uniprot}recommendedName/' '{http://uniprot.org/uniprot}fullName') if tmp is not None: self.label = tmp.text elif e.find('./{http://uniprot.org/uniprot}submittedName/' '{http://uniprot.org/uniprot}fullName') is not None: self.label = e.find('./{http://uniprot.org/uniprot}submittedName/' '{http://uniprot.org/uniprot}fullName').text for prop in e.findall('./{http://uniprot.org/uniprot}alternativeName/'): self.aliases.add(prop.text) if event == 'end' and e.tag == '{http://uniprot.org/uniprot}organism': for prop in e.findall('./{http://uniprot.org/uniprot}dbReference'): if prop.attrib['type'] == 'NCBI Taxonomy': self.tax_id = prop.attrib['id'] # print(e) if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \ and 'type' in e.attrib and e.attrib['type'] == 'Ensembl': for prop in e.findall('./{http://uniprot.org/uniprot}property'): if prop.attrib['type'] == 'protein sequence ID': self.ncbip.add(prop.attrib['value']) self.statements.append(PBB_Core.WDString(value=prop.attrib['value'], prop_nr='P705', references=[self.create_reference()])) if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \ and 'type' in e.attrib and e.attrib['type'] == 'RefSeq': self.ncbip.add(e.attrib['id']) self.statements.append(PBB_Core.WDString(value=e.attrib['id'], prop_nr='P637', references=[self.create_reference()])) # get alternative identifiers for gene to protein mapping if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \ and 'type' in e.attrib and e.attrib['type'] in self.res_prefixes: res_id = e.attrib['id'] if res_id in res_id_to_entrez_qid: self.entrez_quid = res_id_to_entrez_qid[res_id][0] except Et.ParseError as e: print('Error when parsing Uniprot {} XML file, item {} most likely obsolete'.format(self.uniprot, self.uniprot_qid)) PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format( main_data_id='{}'.format(self.uniprot), exception_type=type(e), message=e.__str__(), wd_id=self.uniprot_qid, duration=time.time() - start )) return # get GO annotations from QuickGO params = { 'format': 'tsv', 'limit': '1000', 'protein': self.uniprot } url = 
'http://www.ebi.ac.uk/QuickGO/GAnnotation' try: itrt = iter(requests.get(url, params=params).text.strip('\n ').split('\n')) next(itrt) # skip header line for line in itrt: cols = line.split('\t') go_id = cols[6] evidence_code = cols[9] go_aspect = cols[11][0] if self.uniprot not in pdb_to_go: pdb_to_go[self.uniprot] = { 'go_terms': list(), 'evidence': list(), 'pdb': set() } pdb_to_go[self.uniprot]['go_terms'].append(go_id) pdb_to_go[self.uniprot]['evidence'].append(evidence_code) if go_id in go_prop_map: go_prop_map[go_id]['go_class_prop'] = ProteinBot.get_go_class(go_id, go_aspect) except requests.HTTPError as e: print(e.__str__()) print('Quick GO service not available, exiting!') sys.exit(1) except IndexError: print(e.__str__()) print('Quick GO data error, service likely not available, exiting!') sys.exit(1) # set description according to the annotation the Uniprot entry is coming from self.description = self.descr_map[self.tax_id]['en'] if self.annotation_type == 'TrEMBL': self.description += ' (annotated by UniProtKB/TrEMBL {})'.format(self.uniprot) elif self.annotation_type == 'Swiss-Prot': self.description += ' (annotated by UniProtKB/Swiss-Prot {})'.format(self.uniprot) # assign a GO term a GO subontology/OBO namespace if self.uniprot in pdb_to_go: for go in set(pdb_to_go[self.uniprot]['go_terms']): # check if a GO term is not yet in Wikidata # TODO: If a GO term is not in Wikidata, trigger OBO bot to add it if go not in go_prop_map: PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format( main_data_id='{}'.format(self.uniprot), exception_type='GO term not in Wikidata exception', message='GO term {} not found in Wikidata, skipping this one'.format(go), wd_id=self.uniprot_qid, duration=time.time() - start )) print('GO term {} not found in Wikidata, skipping this one'.format(go)) continue # search in the EBI OBO Lookup Service, for the rare case a GO term has not been assigned its class if not go_prop_map[go]['go_class_prop']: go_class_prop = ProteinBot.get_go_class(go) if not go_class_prop: continue go_prop_map[go]['go_class_prop'] = go_class_prop print('added class code {} to {}'.format(go_prop_map[go]['go_class_prop'], go)) # create a set of WD QIDs representing GO evidence code items in WD evidence = list() for count, ev in enumerate(pdb_to_go[self.uniprot]['evidence']): if pdb_to_go[self.uniprot]['go_terms'][count] == go and self.go_evidence_codes[ev] not in evidence: evidence.append(self.go_evidence_codes[ev]) # iterate though the evidence code set and create a new qualifier for each one qualifiers = [PBB_Core.WDItemID(value=ev, prop_nr='P459', is_qualifier=True) for ev in evidence if ev] # Create Wikidata GO term value prop_nr = self.go_prop_map[go]['go_class_prop'] qid = self.go_prop_map[go]['qid'] self.statements.append(PBB_Core.WDItemID(value=qid, prop_nr=prop_nr, qualifiers=qualifiers, references=[self.create_reference()])) for pdb in pdb_to_go[self.uniprot]['pdb']: self.statements.append(PBB_Core.WDString(value=pdb.upper(), prop_nr='P638', references=[self.create_reference()])) self.statements.append(PBB_Core.WDItemID(value='Q8054', prop_nr='P279', references=[self.create_reference()])) if self.entrez_quid != '': self.statements.append(PBB_Core.WDItemID(value=self.entrez_quid, prop_nr='P702', references=[self.create_reference()])) current_taxonomy_id = self.taxon_map[self.tax_id] self.statements.append(PBB_Core.WDItemID(value=current_taxonomy_id, prop_nr='P703', references=[self.create_reference()])) 
self.statements.append(PBB_Core.WDString(value=self.uniprot, prop_nr='P352', references=[self.create_reference()])) # remove all Wikidata properties where no data has been provided, but are handled by the bot all_stmnt_props = list(map(lambda x: x.get_prop_nr(), self.statements)) for pr in ['P680', 'P681', 'P682', 'P705', 'P637', 'P638', 'P692', 'P702']: if pr not in all_stmnt_props: self.statements.append(PBB_Core.WDBaseDataType.delete_statement(prop_nr=pr)) try: taxon_qid = self.taxon_map[self.tax_id] new_msg = '' if self.uniprot_qid: wd_item = PBB_Core.WDItemEngine(wd_item_id=self.uniprot_qid, domain='proteins', data=self.statements, append_value=['P279'], fast_run=fast_run, fast_run_base_filter={'P703': taxon_qid, 'P279': 'Q8054'}) else: wd_item = PBB_Core.WDItemEngine(item_name=self.label, domain='proteins', data=self.statements) new_msg = 'new protein created' wd_item.set_label(self.label) wd_item.set_description(self.description) wd_item.set_aliases(aliases=self.aliases, append=False) self.uniprot_qid = wd_item.write(self.login) if self.entrez_quid != '': encodes = PBB_Core.WDItemID(value=self.uniprot_qid, prop_nr='P688', references=[self.create_reference()]) gene_item = PBB_Core.WDItemEngine(wd_item_id=self.entrez_quid, data=[encodes], append_value=['P688'], fast_run=fast_run, fast_run_base_filter={'P703': taxon_qid, 'P279': 'Q7187'}) gene_item.write(login) progress[self.uniprot] = self.uniprot_qid PBB_Core.WDItemEngine.log( 'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format( main_data_id='{}'.format(self.uniprot), exception_type='', message='success{}'.format(new_msg), wd_id=self.uniprot_qid, duration=time.time() - start )) # pprint.pprint(wd_item.get_wd_json_representation()) except Exception as e: print(e) PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format( main_data_id='{}'.format(self.uniprot), exception_type=type(e), message=e.__str__(), wd_id=self.uniprot_qid, duration=time.time() - start )) traceback.print_exc() print(self.label) print(self.aliases) print(self.tax_id)
def __init__(self, login): self.login_obj = login image_data = pd.read_csv( './image_data/gene_wiki_images_with_preferred.txt', encoding='utf-8', sep='\t', dtype={'entrez': np.str}) wdq_results = PBB_Core.WDItemList('CLAIM[351] and CLAIM[703:5]', '351').wditems wd_entrez_ids = list(map(lambda z: z[2], wdq_results['props']['351'])) entrez_qid_list = list( map(lambda z: 'Q{}'.format(z[0]), wdq_results['props']['351'])) print(len(wd_entrez_ids)) for index in image_data.index: start = time.time() # print(image_data.loc[index, 'other_images']) image_names = image_data.loc[index, 'other_images'] preferred_image = image_data.loc[index, 'primary_image'] image_file_extension = ['.png', '.jpg', '.jpeg', '.pdf'] if pd.notnull(preferred_image) and '|' in preferred_image: for splt in preferred_image.split('|'): for ending in image_file_extension: if ending in splt: preferred_image = splt break entrez = image_data.loc[index, 'entrez'] # print(entrez) protein_images = [] protein_image_value_store = [] genex_images = [] genex_value_store = [] if entrez not in wd_entrez_ids: PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id=entrez, exception_type='', message='Entrez ID not yet in Wikidata!!', wd_id='', duration=time.time() - start)) continue else: curr_qid = entrez_qid_list[wd_entrez_ids.index(entrez)] if pd.isnull(image_names): PBB_Core.WDItemEngine.log( 'WARNING', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id=entrez, exception_type='', message='No images available for this Entrez ID', wd_id=curr_qid, duration=time.time() - start)) continue for sub_string in image_names.split('|'): if 'PBB GE ' in sub_string: value = sub_string[5:] # if value[-6:-4] == 'tn': # value = value[:-6] + 'fs' + value[-4:] # Gene Expression reference: https://www.wikidata.org/wiki/Q21074956 genex_images.append(value) genex_value_store.append( PBB_Core.WDCommonsMedia(value=value, prop_nr='P692')) elif 'PDB ' in sub_string: value = sub_string[5:] protein_images.append(value) protein_image_value_store.append( PBB_Core.WDCommonsMedia(value, prop_nr='')) entrez_id_value = PBB_Core.WDString(value=entrez, prop_nr='P351') data = [entrez_id_value] data.extend(genex_value_store) if pd.notnull(preferred_image): data.append( PBB_Core.WDCommonsMedia(value=preferred_image, prop_nr='P18')) try: gene_item = PBB_Core.WDItemEngine(wd_item_id=curr_qid, domain='genes', data=data) # pprint.pprint(gene_item.get_wd_json_representation()) gene_item.write(self.login_obj) PBB_Core.WDItemEngine.log( 'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id=entrez, exception_type='', message='success', wd_id=curr_qid, duration=time.time() - start)) print(index, 'success', curr_qid, entrez, gene_item.get_label(lang='en')) except Exception as e: print(index, 'error', curr_qid, entrez) PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format(main_data_id=entrez, exception_type=type(e), message=e.__str__(), wd_id=curr_qid, duration=time.time() - start))
def __init__(self, user, pwd): properties = [ 'P279', 'P769', 'P31', 'P636', 'P267', 'P231', 'P486', 'P672', 'P662', 'P661', 'P652', 'P665', 'P683', 'P274', 'P715', 'P646', 'P592', 'P233', 'P234', 'P235', 'P18', 'P373', 'P2275', 'P657', 'P595', 'P2115' ] # these property names do not match those in Wikidata!! property_names = [ 'subclass of', 'significant drug interaction', 'instance of', 'route of administration', 'ATC code', 'CAS number', 'MeSH ID', 'MeSH Code', 'PubChem ID (CID)', 'ChemSpider', 'UNII', 'KEGG Drug', 'ChEBI', 'Molecular Formula', 'Drugbank ID', 'Freebase identifier', 'ChEMBL', 'SMILES', 'InChI', 'InChIKey', 'image', 'Commons category', 'WHO INN', 'RTECS Number', 'Guide to Pharmacology', 'NDF-RT NUI' ] prop_to_name = dict(zip(properties, property_names)) name_to_prop = dict(zip(property_names, properties)) login_obj = WDLogin(user=user, pwd=pwd, server='www.wikidata.org') drug_data = pd.read_csv('./drugbank_data/drugbank.csv', index_col=0, engine='c', encoding='utf-8', dtype={ 'PubChem ID (CID)': np.str, 'ChEBI': np.str, 'ChEMBL': np.str, 'ChemSpider': np.str, 'Guide to Pharmacology': np.str }) # extract creation date of Drugbank file from Drugbank zip file drugbank_zip = zipfile.ZipFile('./drugbank_data/drugbank.xml.zip') self.drugbank_date = datetime.datetime( *[x for x in drugbank_zip.infolist()[0].date_time]).strftime( '+%Y-%m-%dT00:00:00Z') print(drug_data.dtypes) base_ref = {'ref_properties': ['P248'], 'ref_values': ['Q1122544']} # remove potential 'InChI=' and 'InChIKey=' prefixes for i in drug_data['InChI'].index: if pd.notnull(drug_data['InChI'].at[i]): if 'InChI=' in drug_data['InChI'].at[i]: drug_data['InChI'].at[i] = drug_data['InChI'].at[i][6:] if 'InChIKey=' in drug_data['InChIKey'].at[i]: drug_data['InChIKey'].at[i] = drug_data['InChIKey'].at[i][ 9:] # remove DB prefix from Drugbank ID (should be corrected in the Wikidata property) for i in drug_data['Drugbank ID'].index: if pd.notnull(drug_data['Drugbank ID'].at[i]): drug_data['Drugbank ID'].at[i] = drug_data['Drugbank ID'].at[ i][2:] # Iterate though all drugbank compounds and add those to Wikidata which are either FDA-approved or have been # withdrawn from the market. Add all non-missing values for each drug to Wikidata. for count in drug_data.index: print('Count is:', count) if drug_data.loc[count, 'Status'] == 'approved' or drug_data.loc[ count, 'Status'] == 'withdrawn': data = [] special_cases = ['WHO INN', 'ATC code'] for col in drug_data.columns.values: data_value = drug_data.loc[count, col] # no values and values greater than 400 chars should not be added to wikidata. 
if pd.isnull(data_value) or col not in name_to_prop: continue elif len(data_value) > 400: continue if col in property_names and col not in special_cases: data.append( PBB_Core.WDString(value=str(data_value).strip(), prop_nr=name_to_prop[col])) # add instances of (P31) of chemical compound (Q11173), pharmaceutical drug (Q12140), # Biologic medical product (Q679692) and monoclonal antibodies (Q422248) data.append(PBB_Core.WDItemID(value='Q11173', prop_nr='P31')) data.append(PBB_Core.WDItemID(value='Q12140', prop_nr='P31')) if drug_data.loc[count, 'Drug type'] == 'biotech': data.append( PBB_Core.WDItemID(value='Q679692', prop_nr='P31')) if drug_data.loc[count, 'Name'][-3:] == 'mab': data.append( PBB_Core.WDItemID(value='Q422248', prop_nr='P31')) # for instance of, do not overwrite what other users have put there append_value = ['P31', 'P2275'] # Monolingual value WHO INN requires special treatment if pd.notnull(drug_data.loc[count, 'WHO INN']): data.append( PBB_Core.WDMonolingualText( value=drug_data.loc[count, 'WHO INN'], prop_nr='P2275', language='en')) # split the ATC code values present as one string in the csv file if pd.notnull(drug_data.loc[count, 'ATC code']): for atc in drug_data.loc[count, 'ATC code'].split(';'): data.append( PBB_Core.WDString(value=atc, prop_nr='P267')) drugbank_source = [ 'instance of', 'ATC code', 'CAS number', 'Drugbank ID', 'Molecular Formula', 'InChI', 'InChIKey' ] chembl_source = [ 'ChEMBL', 'ChemSpider', 'KEGG Drug', 'ChEBI', 'SMILES', 'WHO INN', 'Guide to Pharmacology' ] pubchem_source = ['MeSH ID', 'PubChem ID (CID)'] ndfrt_source = ['NDF-RT NUI', 'UNII'] for i in data: if i.get_prop_nr() in [ name_to_prop[x] for x in chembl_source ]: # if no ChEMBL ID exists, data is from Drugbank, therefore add Drugbank as ref if pd.isnull(drug_data.loc[count, 'ChEMBL']): drugbank_source.append( prop_to_name[i.get_prop_nr()]) continue i.set_references( self.make_reference( stated_in='Q6120337', source_element=drug_data.loc[count, 'ChEMBL'], source_element_name=drug_data.loc[count, 'Name'], source_element_prop=name_to_prop['ChEMBL'])) for i in data: if i.get_prop_nr() in [ name_to_prop[x] for x in drugbank_source ]: i.set_references( self.make_reference( stated_in='Q1122544', source_element=drug_data.loc[count, 'Drugbank ID'], source_element_name=drug_data.loc[count, 'Name'], source_element_prop=name_to_prop[ 'Drugbank ID'], date=self.drugbank_date, date_property='P577')) for i in data: if i.get_prop_nr() in [ name_to_prop[x] for x in pubchem_source ] and pd.notnull(drug_data.loc[count, 'PubChem ID (CID)']): i.set_references( self.make_reference( stated_in='Q278487', source_element=drug_data.loc[ count, 'PubChem ID (CID)'], source_element_name=drug_data.loc[count, 'Name'], source_element_prop=name_to_prop[ 'PubChem ID (CID)'])) for i in data: if i.get_prop_nr() in [ name_to_prop[x] for x in ndfrt_source ] and pd.notnull(drug_data.loc[count, 'NDF-RT NUI']): i.set_references( self.make_reference( stated_in='Q21008030', source_element=drug_data.loc[count, 'NDF-RT NUI'], source_element_name=drug_data.loc[ count, 'Name'].upper(), source_element_prop=name_to_prop['NDF-RT NUI']) ) label = drug_data.loc[count, 'Name'] domain = 'drugs' # If label in aliases list, remove the label from it. If an alias is longer than 250 chars, also remove # Aliases longer than 250 characters will trigger an WD API error. 
if pd.notnull(drug_data.loc[count, 'Aliases']): aliases = drug_data.loc[count, 'Aliases'].split(';') for i in aliases: if i == label or i == label.lower( ) or len(i) > 250 or len(i) == 0: aliases.remove(i) start = time.time() # pprint.pprint(data) # pprint.pprint(references) print('Drug name:', label) try: wd_item = PBB_Core.WDItemEngine(item_name=label, domain=domain, data=data, use_sparql=True, append_value=append_value) # overwrite only certain descriptions descriptions_to_overwrite = { 'chemical compound', 'chemical substance', '' } if wd_item.get_description() in descriptions_to_overwrite: wd_item.set_description( description='pharmaceutical drug', lang='en') wd_item.set_label(label=label, lang='en') if pd.notnull(drug_data.loc[count, 'Aliases']): wd_item.set_aliases(aliases=aliases, lang='en', append=True) # pprint.pprint(wd_item.get_wd_json_representation()) wd_item.write(login_obj) new_mgs = '' if wd_item.create_new_item: new_mgs = ': New item' PBB_Core.WDItemEngine.log( 'INFO', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format( main_data_id=drug_data['Drugbank ID'].at[count], exception_type='', message='success{}'.format(new_mgs), wd_id=wd_item.wd_item_id, duration=time.time() - start)) print('success') except Exception as e: print(e) PBB_Core.WDItemEngine.log( 'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}' .format( main_data_id=drug_data['Drugbank ID'].at[count], exception_type=type(e), message=e.__str__(), wd_id='', duration=time.time() - start)) end = time.time() print('Time elapsed:', end - start)