def main(df, log_dir="./logs", fast_run=False):
    """Create the missing variant items, then annotate every usable row.

    The frame is first narrowed by ``filter_df_clinical_missense``.  A first
    pass creates an item for each variant whose gDNA is not yet in the
    HGVS -> QID map; a second pass writes one annotation per row through
    ``create_variant_annotation``.

    :param df: table of variant/drug associations (columns read here: gDNA,
        individual_mutation, Association, Drug_qid, prim_tt_qid, Source,
        'Evidence level')
    :param log_dir: not used by this function; kept so callers need not change
    :param fast_run: forwarded to ``create_missense_variant_item``
    """
    df = filter_df_clinical_missense(df)

    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)

    # Make sure we have all the variant items we need: only variants that are
    # not mapped yet have to be created, already-mapped ones are skipped.
    # NOTE(review): presumably the mapper can come back empty/None when no
    # item carries the property -- the dict fallback keeps the membership
    # tests below working in that case.
    hgvs_qid = id_mapper(PROPS['HGVS nomenclature']) or {}
    for _, row in tqdm(df.iterrows(), total=len(df)):
        if row.gDNA in hgvs_qid:
            continue
        label = "{} ({})".format(row.gDNA, row['individual_mutation'])
        print("creating {}".format(label))
        try:
            item = create_missense_variant_item(row.gDNA, label, login, fast_run=fast_run)
        except Exception as e:
            # Best effort: report the failure and carry on with the next row.
            print(e)
            wdi_core.WDItemEngine.log(
                "ERROR",
                wdi_helpers.format_msg(row.gDNA, "gDNA", None, str(e), type(e)))
            continue
        hgvs_qid[row.gDNA] = item.wd_item_id

    for _, row in tqdm(df.iterrows(), total=len(df)):
        if row.gDNA not in hgvs_qid:
            # Creation failed above (or never happened): nothing to annotate.
            wdi_core.WDItemEngine.log(
                "WARNING",
                wdi_helpers.format_msg(
                    row.gDNA, "gDNA", None,
                    "variant not found: {}".format(row.gDNA),
                    "variant not found"))
            continue
        if row.Association not in association_map:
            wdi_core.WDItemEngine.log(
                "WARNING",
                wdi_helpers.format_msg(
                    row.gDNA, "gDNA", None,
                    "Association not found: {}".format(row.Association),
                    "association not found"))
            continue

        qid = hgvs_qid[row.gDNA]
        association = association_map[row.Association]
        drug_qid = row.Drug_qid
        prim_tt_qid = row.prim_tt_qid
        source = row.Source
        evidence_level = row['Evidence level']

        create_variant_annotation(qid, association, drug_qid, prim_tt_qid,
                                  source, evidence_level, login)
def create(self, write=True):
    """Collect this term's statements and hand its item to ``try_write``.

    A deprecated term is only reported (logged as WARNING and printed with
    the "delete me" message).  Otherwise the xref and main statements are
    gathered, the English label, description and aliases are adjusted and
    the item is passed to ``wdi_helpers.try_write``.

    :param write: forwarded to ``wdi_helpers.try_write``
    :return: the ``WDItemEngine`` on success; ``None`` for a deprecated term
        or after an exception was caught and logged
    """
    if self.deprecated:
        msg = wdi_helpers.format_msg(self.doid, 'P699', None, "delete me", msg_type="delete me")
        wdi_core.WDItemEngine.log("WARNING", msg)
        print(msg)
        return None

    try:
        self.create_xref_statements()
        self.s.extend(self.s_xref)
        self.create_main_statements()
        self.s.extend(self.s_main)

        appendable_props = [
            PROPS[key] for key in (
                'subclass of', 'instance of', 'has cause', 'location',
                'OMIM ID', 'Orphanet ID', 'MeSH ID', 'ICD-10-CM', 'ICD-10',
                'ICD-9-CM', 'ICD-9', 'NCI Thesaurus ID', 'UMLS CUI',
            )
        ]
        wd_item = wdi_core.WDItemEngine(
            data=self.s,
            append_value=appendable_props,
            fast_run=self.do_graph.fast_run,
            fast_run_base_filter={'P699': ''},
            fast_run_use_refs=True,
            global_ref_mode='CUSTOM',
            ref_handler=update_retrieved_if_new,
        )
        wd_item.fast_run_container.debug = False

        # Only fill in a label when the item has none.
        if wd_item.get_label(lang="en") == "":
            wd_item.set_label(self.lbl, lang="en")

        current_descr = wd_item.get_description(lang='en')
        definition_fits = self.definition and len(self.definition) < 250
        if current_descr == self.definition and definition_fits:
            # The stored description is the raw definition: swap in the cleaned one.
            wd_item.set_description(utils.clean_description(self.definition))
        elif current_descr.lower() in {"", "human disease", "disease"} and definition_fits:
            wd_item.set_description(utils.clean_description(self.definition))
        elif current_descr.lower() == "":
            wd_item.set_description(description="human disease", lang='en')

        if self.synonyms is not None:
            wd_item.set_aliases(aliases=self.synonyms, lang='en', append=True)

        if self.wikilink is not None:
            # Deliberately no sitelink: a lot of the suggested links are not right.
            pass

        wdi_helpers.try_write(wd_item, record_id=self.doid, record_prop='P699',
                              login=self.do_graph.login, write=write)
        return wd_item
    except Exception as e:
        caught = sys.exc_info()
        print(self.doid)
        traceback.print_exception(*caught)
        msg = wdi_helpers.format_msg(self.doid, 'P699', None, str(e), msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
def filter_df_clinical_missense(df):
    """Keep only the rows that can be turned into a variant annotation.

    Rows are removed, in this order, when they lack a clinical evidence
    level, have no gDNA (HGVS) value, have no drug QID, list several drugs
    (a ``;`` in ``Drug_qid``) or have no primary tumor type QID.  Every row
    removed after the evidence-level step is logged as a WARNING.

    :param df: pandas DataFrame with the columns 'Evidence level', gDNA,
        Drug_qid, prim_tt_qid, Alteration, Drug and 'Primary Tumor type'
    :return: the filtered DataFrame
    """
    # Keep those with clinical evidence only
    clinical_evidence = {
        'FDA guidelines', 'European LeukemiaNet guidelines', 'NCCN guidelines',
        'CPIC guidelines', 'NCCN/CAP guidelines',
    }
    df = df[df['Evidence level'].isin(clinical_evidence)]

    # MUT only, with a HGVS ID
    no_hgvs = df.gDNA.isnull()
    _log_dropped_alterations(df[no_hgvs], lambda row: '', "no HGVS ID")
    df = df[~no_hgvs]

    # get rid of those where we don't know the drug
    no_drug = df.Drug_qid.isnull()
    _log_dropped_alterations(
        df[no_drug], lambda row: "unknown drug: {}".format(row.Drug), "unknown drug")
    df = df[~no_drug]

    # get rid of the multiple drug ("or") items; these get their own message
    # so they are not mistaken for rows whose drug is unknown
    several_drugs = df.Drug_qid.str.count(";") != 0
    _log_dropped_alterations(
        df[several_drugs], lambda row: "multiple drugs: {}".format(row.Drug), "multiple drugs")
    df = df[~several_drugs]

    # get rid of those where we don't know the disease
    no_disease = df.prim_tt_qid.isnull()
    _log_dropped_alterations(
        df[no_disease],
        lambda row: "unknown disease: {}".format(row['Primary Tumor type']),
        "unknown disease")
    df = df[~no_disease]

    return df


def _log_dropped_alterations(dropped, describe, msg_type):
    """Log one WARNING per row of *dropped*; *describe(row)* gives the message text."""
    for _, row in dropped.iterrows():
        wdi_core.WDItemEngine.log(
            "WARNING",
            wdi_helpers.format_msg(row.Alteration, "alteration", None, describe(row), msg_type))
def make_gene_encodes(self, write=True):
    """
    Put an "encodes" statement, pointing at this protein, on the gene item.

    :param write: forwarded to ``wdi_helpers.try_write``
    :return: None; a failure is printed and logged as ERROR
    """
    source = self.record['uniprot']['@source']
    uniprot_pid = PROPS['UniProt ID']
    uniprot_id = self.external_ids['UniProt ID']
    uniprot_ref = make_ref_source(source, uniprot_pid, uniprot_id, login=self.login)

    try:
        encodes = wdi_core.WDItemID(self.protein_wdid, PROPS['encodes'],
                                    references=[uniprot_ref])
        wd_item_gene = wdi_core.WDItemEngine(
            wd_item_id=self.gene_wdid,
            domain='genes',
            data=[encodes],
            append_value=[PROPS['encodes']])
        wdi_helpers.try_write(wd_item_gene, self.external_ids['UniProt ID'],
                              PROPS['UniProt ID'], self.login, write=write)
    except Exception as e:
        traceback.print_exception(*sys.exc_info())
        msg = wdi_helpers.format_msg(self.external_ids['UniProt ID'], PROPS['UniProt ID'],
                                     None, str(e), msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
def create_relationships(self, login, write=True):
    """Write the subclass-of / has-part / part-of statements of this term.

    Nothing is written when the QID lookup raises ``KeyError`` (logged as
    ERROR) or when the term has no relationship beyond its own identifier.

    :param login: login object handed to ``wdi_helpers.try_write``
    :param write: forwarded to ``wdi_helpers.try_write``
    """
    try:
        # endpoint may not get updated in time?
        self.do_wdid_lookup()
    except KeyError as e:
        wdi_core.WDItemEngine.log("ERROR", format_msg(self.id, INTERPRO, None, str(e), type(e)))
        return

    def link(target_qid, prop_nr):
        # Each statement gets its own reference list, as in a hand-written call.
        return wdi_core.WDItemID(value=target_qid, prop_nr=prop_nr, references=[self.reference])

    statements = [wdi_core.WDExternalID(value=self.id, prop_nr=INTERPRO, references=[self.reference])]
    if self.parent:
        statements.append(link(self.parent_wdid, 'P279'))  # subclass of
    if self.contains:
        statements.extend(link(part, 'P527') for part in self.contains_wdid)  # has part
    if self.found_in:
        statements.extend(link(whole, 'P361') for whole in self.found_in_wdid)  # part of

    # Only the identifier statement: there is no relationship to write.
    if len(statements) == 1:
        return

    wd_item = wdi_core.WDItemEngine(
        wd_item_id=self.wdid,
        domain='interpro',
        data=statements,
        append_value=['P279', 'P527', 'P361'],
        fast_run=True,
        fast_run_base_filter=IPRTerm.fast_run_base_filter)
    wdi_helpers.try_write(wd_item, self.id, INTERPRO, login,
                          edit_summary="create/update subclass/has part/part of",
                          write=write)
def run_one(self, record, gene_wdid, write):
    """Create or update the protein item for one record and link its gene.

    :param record: source record handed to ``Protein``
    :param gene_wdid: QID of the gene item that encodes the protein
    :param write: forwarded to the item-writing calls
    """
    # NOTE(review): `fast_run` is not a parameter of this method; it has to
    # resolve from an enclosing/module scope -- confirm it is defined there.
    protein = Protein(record, self.organism_info, gene_wdid, self.login)
    try:
        protein.parse_external_ids()
        uniprot = protein.external_ids['UniProt ID']
    except Exception as e:
        msg = wdi_helpers.format_msg(gene_wdid, None, None, str(e), msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
        return

    # some proteins are encoded by multiple genes. don't try to create it again
    seen_before = uniprot in self.uniprot_qid
    if seen_before:
        wditem = protein.update_item(self.uniprot_qid[uniprot], fast_run=fast_run, write=write)
    else:
        wditem = protein.create_item(fast_run=fast_run, write=write)

    if wditem is not None:
        self.uniprot_qid[uniprot] = wditem.wd_item_id
        protein.make_gene_encodes(write=write)

    if protein.status is not True:
        self.failed.append(protein.uniprot)
def create_depend(self, login=None, write=True):
    """Write the main statements onto the already existing item of this node.

    :param login: login object handed to ``wdi_helpers.try_write``
    :param write: forwarded to ``wdi_helpers.try_write``
    :return: the ``WDItemEngine`` on success; ``None`` when the node is
        deprecated, has no item id yet, or an exception was caught and logged
    """
    if self.deprecated:
        return None
    if not self.wd_item_id:
        # The message has to name this node; there is no other node in scope.
        print("must create item first: {}".format(self.id_purl))
        return None

    try:
        s = self.create_main_statements()
        wd_item = wdi_core.WDItemEngine(
            wd_item_id=self.wd_item_id,
            data=s,
            domain=self.domain,
            append_value=[PROPS['subclass of'], PROPS['instance of']],
            fast_run=self.fast_run,
            fast_run_base_filter={self.primary_ext_prop_qid: ''})
        wdi_helpers.try_write(wd_item, record_id=self.id_colon,
                              record_prop=self.primary_ext_prop_qid,
                              login=login, write=write)
        return wd_item
    except Exception as e:
        exc_info = sys.exc_info()
        traceback.print_exception(*exc_info)
        msg = wdi_helpers.format_msg(self.id_colon, self.primary_ext_prop_qid, None,
                                     str(e), msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
        return None
def make_statement_from_edge(self, edge):
    """Build the statement for one edge, with special handling of two predicates.

    Edges whose predicate is RO_0002212 or RO_0002213 become a statement
    using the property mapped for RO_0002211, qualified with the item that
    ``self.regulates`` associates with the predicate.  Every other edge is
    delegated to the parent class.

    :param edge: dict with the keys 'sub', 'pred' and 'obj'
    :return: a statement object, or ``None`` when a required id is missing
    """
    # custom statement creator for regulates
    helper = self.helper
    qualified_predicates = {
        'http://purl.obolibrary.org/obo/RO_0002212',
        'http://purl.obolibrary.org/obo/RO_0002213',
    }
    if edge['pred'] not in qualified_predicates:
        return super(GOGraph, self).make_statement_from_edge(edge)

    subj_node = self.uri_node_map[edge['sub']]
    obj_qid = self.get_object_qid(edge['obj'])
    qual_qid = self.uri_node_map[self.regulates[edge['pred']]].qid
    pred_pid = self.PRED_PID_MAP['http://purl.obolibrary.org/obo/RO_0002211']

    if not (obj_qid and qual_qid and pred_pid):
        m = wdi_helpers.format_msg(edge['sub'], None, None,
                                   "failed on edge: {}".format(edge))
        print(m)
        wdi_core.WDItemEngine.log("WARNING", m)
        return None

    role_pid = helper.get_pid(PROPS['subject has role'])
    qualifier = wdi_core.WDItemID(qual_qid, role_pid, is_qualifier=True)
    return wdi_core.WDItemID(
        obj_qid,
        pred_pid,
        qualifiers=[qualifier],
        references=[subj_node.create_ref_statement()])
def create_main_statements(self):
    """Fill ``self.s_main`` with the relationship and identity statements.

    One item statement is added per relationship whose predicate and object
    are both known to the graph, followed by an exact-match statement for
    ``self.id``, an instance-of Q12136 statement (skipped for "DOID:4") and
    an identifiers.org exact-match statement with its own reference.
    """
    if not self.reference:
        self.create_reference()

    self.s_main = []
    graph = self.do_graph
    for relationship in self.relationships:
        predicate = relationship[0]
        if predicate not in graph.edge_prop:
            # unknown relationship types are skipped without logging
            continue
        target = relationship[1]
        if target not in graph.purl_wdid:
            s = "unknown obj: {}".format(target)
            msg = wdi_helpers.format_msg(self.doid, 'P699', None, s, msg_type="unknown obj")
            wdi_core.WDItemEngine.log("WARNING", msg)
            continue
        self.s_main.append(
            wdi_core.WDItemID(graph.purl_wdid[target], graph.edge_prop[predicate],
                              references=[self.reference]))

    # add http://purl.obolibrary.org/obo/, exact match
    self.s_main.append(
        wdi_core.WDString(self.id, PROPS['exact match'], references=[self.reference]))

    if self.doid != "DOID:4":
        # instance of disease
        self.s_main.append(
            wdi_core.WDItemID('Q12136', PROPS['instance of'], references=[self.reference]))

    miriam_ref = [
        wdi_core.WDItemID(value="Q16335166", prop_nr='P248', is_reference=True),
        wdi_core.WDUrl("http://www.ebi.ac.uk/miriam/main/collections/MIR:00000233", 'P854',
                       is_reference=True),
    ]
    identifiers_url = "http://identifiers.org/doid/{}".format(self.doid)
    self.s_main.append(
        wdi_core.WDString(identifiers_url, PROPS['exact match'], references=[miriam_ref]))
def create_item(self, login=None, fast_run=True, write=True):
    """Build the item for this term and, when a login is given, write it.

    :param login: login object; if no login is given, no write is attempted
    :param fast_run: forwarded to ``WDItemEngine``
    :param write: forwarded to ``wdi_helpers.try_write``
    :return: the ``WDItemEngine``, or ``None`` when building it raised
        ``JSONDecodeError`` (logged as ERROR)
    """
    id_statement = wdi_core.WDExternalID(value=self.id, prop_nr=INTERPRO,
                                         references=[self.reference])
    type_statement = wdi_core.WDItemID(value=self.type_wdid, prop_nr=INSTANCE_OF,
                                       references=[self.reference])
    statements = [id_statement, type_statement]

    try:
        wd_item = wdi_core.WDItemEngine(
            item_name=self.name,
            domain='interpro',
            data=statements,
            append_value=["P279", "P31"],
            fast_run=fast_run,
            fast_run_base_filter=IPRTerm.fast_run_base_filter)
    except JSONDecodeError as e:
        wdi_core.WDItemEngine.log(
            "ERROR",
            wdi_helpers.format_msg(self.id, INTERPRO, None, str(e), msg_type=type(e)))
        return None

    wd_item.set_label(self.name, lang='en')
    # Existing descriptions are left alone; only empty ones are filled in.
    for lang, description in self.lang_descr.items():
        if wd_item.get_description(lang=lang) == "":
            wd_item.set_description(description, lang=lang)
    wd_item.set_aliases([self.short_name, self.id])

    if login:
        wdi_helpers.try_write(wd_item, self.id, INTERPRO, login, write=write)

    return wd_item
def update_item(self, qid, fast_run=True, write=True):
    """Rebuild this protein's statements and write them onto item *qid*.

    :param qid: id of the existing protein item
    :param fast_run: forwarded to ``WDItemEngine``
    :param write: forwarded to ``wdi_helpers.try_write``
    :return: the ``WDItemEngine`` on success, ``None`` after a logged failure
    """
    # NOTE(review): unlike a create path that records the write result, this
    # method never assigns `self.status` -- confirm callers do not rely on it.
    print("updating protein: {}".format(qid))
    try:
        self.parse_external_ids()
        self.statements = self.create_statements()

        appendable_props = [
            PROPS['instance of'],
            PROPS['encoded by'],
            PROPS['Ensembl Protein ID'],
            PROPS['RefSeq Protein ID'],
        ]
        base_filter = {
            PROPS['UniProt ID']: '',
            PROPS['found in taxon']: self.organism_info['wdid'],
        }
        wd_item_protein = wdi_core.WDItemEngine(
            wd_item_id=qid,
            data=self.statements,
            append_value=appendable_props,
            fast_run=fast_run,
            fast_run_base_filter=base_filter,
            fast_run_use_refs=True,
            ref_handler=update_retrieved_if_new,
            global_ref_mode="CUSTOM",
            core_props=core_props)
        wdi_helpers.try_write(wd_item_protein, self.external_ids['UniProt ID'],
                              PROPS['UniProt ID'], self.login, write=write)
        self.protein_wdid = wd_item_protein.wd_item_id
        return wd_item_protein
    except Exception as e:
        traceback.print_exception(*sys.exc_info())
        msg = wdi_helpers.format_msg(self.external_ids['Entrez Gene ID'],
                                     PROPS['Entrez Gene ID'], None, str(e),
                                     msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
        return None
def make_gene_encodes(self, write=True):
    """
    Put an "encodes" statement, pointing at this protein, on the gene item.

    :param write: forwarded to ``wdi_helpers.try_write``
    :return: None; a failure is printed and logged as ERROR
    """
    # NOTE(review): `fast_run` is not a parameter of this method; it has to
    # resolve from an enclosing/module scope -- confirm it is defined there.
    source = self.record['uniprot']['@source']
    uniprot_pid = PROPS['UniProt ID']
    uniprot_id = self.external_ids['UniProt ID']
    uniprot_ref = make_ref_source(source, uniprot_pid, uniprot_id, login=self.login)

    try:
        encodes = wdi_core.WDItemID(self.protein_wdid, PROPS['encodes'],
                                    references=[uniprot_ref])
        base_filter = {
            PROPS['Entrez Gene ID']: '',
            PROPS['found in taxon']: self.organism_info['wdid'],
        }
        wd_item_gene = wdi_core.WDItemEngine(
            wd_item_id=self.gene_wdid,
            data=[encodes],
            append_value=[PROPS['encodes']],
            fast_run=fast_run,
            fast_run_base_filter=base_filter,
            global_ref_mode="CUSTOM",
            ref_handler=update_retrieved_if_new,
            core_props=core_props)
        wdi_helpers.try_write(wd_item_gene, self.external_ids['UniProt ID'],
                              PROPS['UniProt ID'], self.login, write=write)
    except Exception as e:
        traceback.print_exception(*sys.exc_info())
        msg = wdi_helpers.format_msg(self.external_ids['UniProt ID'], PROPS['UniProt ID'],
                                     None, str(e), msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
def run_one(taxid, genbank_id):
    """Add a GenBank assembly accession to the item of one organism.

    :param taxid: taxonomy id (converted to ``str`` for the lookup)
    :param genbank_id: accession value to write
    :return: ``None``; an unknown taxid is logged as WARNING and skipped
    """
    accession_pid = PROPS['GenBank Assembly accession']

    # get the QID
    taxid = str(taxid)
    if taxid not in tax_qid_map:
        msg = wdi_helpers.format_msg(
            genbank_id, accession_pid, "",
            "organism with taxid {} not found or skipped".format(taxid))
        wdi_core.WDItemEngine.log("WARNING", msg)
        return None
    qid = tax_qid_map[taxid]

    reference = create_reference(genbank_id)
    genbank_statement = wdi_core.WDExternalID(
        genbank_id, accession_pid, references=[reference])

    # create the item object, specifying the qid
    item = wdi_core.WDItemEngine(
        data=[genbank_statement],
        wd_item_id=qid,
        fast_run=True,
        fast_run_base_filter={accession_pid: ''},
        global_ref_mode='CUSTOM',
        fast_run_use_refs=True,
        ref_handler=update_retrieved_if_new)

    wdi_helpers.try_write(
        item,
        record_id=genbank_id,
        record_prop=accession_pid,
        login=login,
        edit_summary="update GenBank Assembly accession")
def get_object_qid(self, edge_obj):
    """Resolve the object URI of an edge to a QID.

    The object does not have to be a node of this graph (for example an
    UBERON class referenced from the DO).  The graph's own nodes are tried
    first; otherwise the URI is turned into a (property, value) pair and
    looked up among all values of that property, which are loaded once per
    property and cached in ``self.pid_id_mapper``.

    :param edge_obj: URI of the edge's object
    :return: the QID, or ``None`` when the URI cannot be parsed or does not
        map to exactly one item (each case is printed and logged as WARNING)
    """
    def warn(text):
        m = wdi_helpers.format_msg(None, None, None, text)
        print(m)
        wdi_core.WDItemEngine.log("WARNING", m)

    # first. check if this URI exists in our graph
    if edge_obj in self.uri_node_map:
        return self.uri_node_map[edge_obj].qid

    # if not, check if the prefix exists in wikidata
    try:
        obj_pid, obj_value = cu.parse_curie(cu.uri_to_curie(edge_obj))
    except Exception:
        warn("edge object not found: {}".format(edge_obj))
        return None
    obj_pid = self.helper.get_pid(obj_pid)

    # if this property exists, get all of the values for this property
    if obj_pid not in self.pid_id_mapper:
        print("loading: {}".format(obj_pid))
        id_map = wdi_helpers.id_mapper(obj_pid, return_as_set=True, prefer_exact_match=True,
                                       endpoint=self.sparql_endpoint_url)
        self.pid_id_mapper[obj_pid] = id_map if id_map else dict()

    # look up by the value
    values_to_qids = self.pid_id_mapper[obj_pid]
    if obj_value not in values_to_qids:
        warn("no qids found for: {}".format(edge_obj))
        return None

    obj_qids = values_to_qids[obj_value]
    if len(obj_qids) == 1:
        return list(obj_qids)[0]

    warn("multiple qids ({}) found for: {}".format(obj_qids, edge_obj))
    return None
def parse_nodes(self, nodes):
    """Wrap the raw nodes in ``DONode`` and keep the live DOID classes.

    A node is stored in ``self.nodes`` (keyed by its id) when the id contains
    the DOID PURL prefix, it is not deprecated and its type is "CLASS".  A
    node that raises while being processed is logged as ERROR and skipped.

    :param nodes: iterable of raw node dicts (each has an 'id' key)
    """
    doid_prefix = "http://purl.obolibrary.org/obo/DOID_"
    for node in nodes:
        try:
            candidate = DONode(node, self)
            keep = (doid_prefix in candidate.id
                    and not candidate.deprecated
                    and candidate.type == "CLASS")
            if keep:
                self.nodes[candidate.id] = candidate
        except Exception as e:
            msg = wdi_helpers.format_msg(node['id'], 'P699', None, str(e), msg_type=type(e))
            wdi_core.WDItemEngine.log("ERROR", msg)
def run_shex_manifest():
    """Validate the items of every manifest case against its ShEx schema.

    The manifest URL is read from the ``SHEX_MANIFEST`` environment variable.
    For each case whose ``data`` starts with "Endpoint:", the schema is
    fetched, the case's SPARQL query is run and every returned item is
    evaluated; conforming items are logged as INFO, the others as ERROR
    together with the evaluator's reason.

    The ``debug`` environment variable switches evaluator debugging on when
    it is exactly "True"; any other value, or no value at all, means off.

    :raises KeyError: when ``SHEX_MANIFEST`` is not set
    """
    manifest_url = os.environ["SHEX_MANIFEST"]
    print(manifest_url)
    manifest = jsonasobj.loads(requests.get(manifest_url).text)

    # Parsed once up front so that an unset or unexpected value can neither
    # leave `debug` unbound nor abort the run half-way through.
    debug = os.environ.get("debug", "False") == "True"

    for case in manifest:
        if not case.data.startswith("Endpoint:"):
            continue
        sparql_endpoint = case.data.replace("Endpoint: ", "")
        schema = requests.get(case.schemaURL).text
        shex = ShExC(schema).schema
        evaluator = ShExEvaluator(schema=shex, debug=True)
        sparql_query = case.queryMap.replace("SPARQL '''", "").replace("'''@START", "")

        df = wdi_core.WDItemEngine.execute_sparql_query(sparql_query)
        for row in df["results"]["bindings"]:
            wdid = row["item"]["value"]
            slurpeddata = SlurpyGraph(sparql_endpoint)
            try:
                results = evaluator.evaluate(rdf=slurpeddata, focus=wdid, debug=debug)
                for result in results:
                    if result.result:
                        print(str(result.focus) + ": INFO")
                        msg = wdi_helpers.format_msg(wdid, wdid, None, 'CONFORMS', '')
                        wdi_core.WDItemEngine.log("INFO", msg)
                    else:
                        msg = wdi_helpers.format_msg(wdid, wdid, None, '', result.reason)
                        wdi_core.WDItemEngine.log("ERROR", msg)
            except RuntimeError:
                print("Continue after 1 minute, no validation happened on" + wdid)
                continue
def create_edges(self, login, write=True):
    """Write the edge-derived statements of every node that has a QID.

    For each node the statements built from its outgoing edges are collected,
    an instance-of statement is added for each of its root nodes (never
    pointing at the node itself), and the result is written to the node's
    item.  Nodes without a QID are logged as WARNING and skipped; nodes
    without any statement are skipped silently.

    :param login: login object handed to ``wdi_helpers.try_write``
    :param write: forwarded to ``wdi_helpers.try_write``
    """
    # skip edges where the subject is not one of our nodes
    all_uris = {node.id_uri for node in self.nodes}
    skipped = sum(1 for e in self.edges if e['sub'] not in all_uris)
    print("skipping {} edges where the subject is a node that is being skipped".format(skipped))

    # Group the edges by subject once, keeping their order, so the node loop
    # does not have to scan the complete edge list for every node.
    edges_by_subject = {}
    for edge in self.edges:
        edges_by_subject.setdefault(edge['sub'], []).append(edge)

    for node in tqdm(self.nodes, desc="creating edges"):
        if not node.qid:
            m = wdi_helpers.format_msg(node.id_curie, node.id_pid, None,
                                       "QID not found, skipping edges")
            print(m)
            wdi_core.WDItemEngine.log("WARNING", m)
            continue

        this_uri = node.id_uri
        ss = []
        for edge in edges_by_subject.get(this_uri, ()):
            s = self.make_statement_from_edge(edge)
            if s and s.get_value():
                ss.append(s)

        # set instance of using the root node
        for root_node in self.root_node[node.id_uri]:
            # don't add instance of self!
            if root_node in self.uri_node_map and root_node != node.id_uri:
                ref = node.create_ref_statement()
                value_qid = self.uri_node_map[root_node].qid
                if value_qid:
                    ss.append(wdi_core.WDItemID(value_qid, self.helper.get_pid('P31'),
                                                references=[ref]))

        if not ss:
            # there are no statements for this node
            continue

        item = wdi_core.WDItemEngine(
            wd_item_id=node.qid,
            data=ss,
            append_value=self.APPEND_PROPS,
            fast_run=self.FAST_RUN,
            fast_run_base_filter={node.id_pid: ''},
            fast_run_use_refs=True,
            global_ref_mode='CUSTOM',
            ref_handler=self.ref_handler,
            sparql_endpoint_url=self.sparql_endpoint_url,
            mediawiki_api_url=self.mediawiki_api_url,
            core_props=self.CORE_IDS,
        )
        this_pid, this_value = cu.parse_curie(cu.uri_to_curie(this_uri))
        this_pid = self.helper.get_pid(this_pid)
        wdi_helpers.try_write(item, record_id=this_value, record_prop=this_pid,
                              login=login, write=write)
def create_xref_statement(self, xref):
    """Turn one cross-reference CURIE into an external-id statement.

    :param xref: cross reference in ``prefix:identifier`` form
    :return: the statement, or ``None`` (after a WARNING log entry) when the
        prefix is not in ``cu.curie_map``
    """
    ref = self.create_ref_statement()

    prefix = xref.split(":")[0]
    if prefix not in cu.curie_map:
        # log this curie prefix not being found
        m = wdi_helpers.format_msg(self.id_curie, self.id_pid, self.qid,
                                   "curie prefix not found: {}".format(prefix))
        wdi_core.WDItemEngine.log("WARNING", m)
        return None

    raw_pid, ext_id = cu.parse_curie(xref)
    pid = self.helper.get_pid(raw_pid)
    # Remember every property used for an xref on this node.
    self.pids.add(pid)
    return wdi_core.WDExternalID(ext_id, pid, references=[ref])
def create(self, login, write=True, allow_new=True):
    """Create the item for this node, or fetch the existing one, and write it.

    Covers the primary external id, the xrefs, instance of (if set), label,
    description and aliases.  Other properties (i.e. subclass) are left out
    because they may require items that do not exist yet.

    :param login: login object handed to ``wdi_helpers.try_write``
    :param write: forwarded to ``wdi_helpers.try_write``
    :param allow_new: when false, nothing is written if a new item would
        have to be created
    :return: ``None``; on success ``self.qid`` holds the item id
    """
    self._pre_create()
    assert self.id_curie
    s = self.create_statements()

    primary_ext_id_pid, primary_ext_id = cu.parse_curie(self.id_curie)
    primary_ext_id_pid = self.helper.get_pid(primary_ext_id_pid)
    assert primary_ext_id_pid in self.graph.APPEND_PROPS

    try:
        self.item = wdi_core.WDItemEngine(
            data=s,
            append_value=self.graph.APPEND_PROPS,
            fast_run=self.graph.FAST_RUN,
            fast_run_base_filter={primary_ext_id_pid: ''},
            fast_run_use_refs=True,
            global_ref_mode='CUSTOM',
            ref_handler=self.ref_handler,
            mediawiki_api_url=self.mediawiki_api_url,
            sparql_endpoint_url=self.sparql_endpoint_url,
            core_props=self.graph.CORE_IDS,
            core_prop_match_thresh=.9
        )

        # assert the retrieved item doesn't already have a primary_ext_id id
        if self.item.wd_item_id:
            query = "select ?primary_ext_id where {{ wd:{} wdt:{} ?primary_ext_id }}".format(
                self.item.wd_item_id, primary_ext_id_pid)
            results = wdi_core.WDItemEngine.execute_sparql_query(query)['results']['bindings']
            if results:
                existing_primary_ext_id = [x['primary_ext_id']['value'] for x in results]
                if self.id_curie not in existing_primary_ext_id:
                    raise Exception(
                        "conflicting primary_ext_id IDs: {} on {}".format(
                            self.id_curie, self.item.wd_item_id))

        if self.item.create_new_item and not allow_new:
            return None
    except Exception as e:
        traceback.print_exc()
        msg = wdi_helpers.format_msg(primary_ext_id, primary_ext_id_pid, None, str(e),
                                     msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
        return

    self.set_label(self.item)
    self.set_descr(self.item)
    self.set_aliases(self.item)

    # todo: I want to avoid this from happening:
    # https://www.wikidata.org/w/index.php?title=Q4553565&diff=676750840&oldid=647941942
    wdi_helpers.try_write(self.item, record_id=primary_ext_id,
                          record_prop=primary_ext_id_pid, login=login, write=write)

    self.qid = self.item.wd_item_id
def run_shex_manifest():
    """Validate the Reactome pathway items against the published ShEx schema.

    The manifest is downloaded from the Genewiki-ShEx repository.  For each
    case whose ``data`` starts with "Endpoint:", the schema is fetched, the
    case's SPARQL query is run and every returned item is evaluated;
    conforming items are logged as INFO, the others as ERROR together with
    the evaluator's reason.
    """
    manifest = jsonasobj.loads(
        requests.get(
            "https://raw.githubusercontent.com/SuLab/Genewiki-ShEx/master/pathways/reactome/manifest.json"
        ).text)

    for case in manifest:
        if not case.data.startswith("Endpoint:"):
            continue
        sparql_endpoint = case.data.replace("Endpoint: ", "")
        schema = requests.get(case.schemaURL).text
        shex = ShExC(schema).schema
        evaluator = ShExEvaluator(schema=shex, debug=True)
        sparql_query = case.queryMap.replace("SPARQL '''", "").replace("'''@START", "")

        df = wdi_core.WDItemEngine.execute_sparql_query(sparql_query)
        for row in df["results"]["bindings"]:
            wdid = row["item"]["value"]
            slurpeddata = SlurpyGraph(sparql_endpoint)
            try:
                results = evaluator.evaluate(rdf=slurpeddata, focus=wdid, debug=False)
                for result in results:
                    if result.result:
                        print(str(result.focus) + ": INFO")
                        msg = wdi_helpers.format_msg(wdid, wdid, None, 'CONFORMS', '')
                        wdi_core.WDItemEngine.log("INFO", msg)
                    else:
                        # Log the message that was just built, and carry the
                        # evaluator's reason so the failure can be diagnosed.
                        msg = wdi_helpers.format_msg(wdid, wdid, None, '', result.reason)
                        wdi_core.WDItemEngine.log("ERROR", msg)
            except RuntimeError:
                print("Continue after 1 minute, no validation happened on" + wdid)
                continue
def create_item(self, fast_run=True, write=True):
    """Build the protein item from this record and write it.

    ``self.status`` receives the result of ``wdi_helpers.try_write`` on the
    normal path and the formatted error message after a failure.

    :param fast_run: forwarded to ``WDItemEngine``
    :param write: forwarded to ``wdi_helpers.try_write``
    :return: the ``WDItemEngine`` on success, ``None`` after a logged failure
    """
    try:
        self.parse_external_ids()
        self.statements = self.create_statements()
        self.create_label()
        self.create_description()
        self.create_aliases()

        base_filter = {
            PROPS['UniProt ID']: '',
            PROPS['found in taxon']: self.organism_info['wdid'],
        }
        wd_item_protein = wdi_core.WDItemEngine(
            item_name=self.label,
            domain='proteins',
            data=self.statements,
            append_value=[PROPS['instance of'], PROPS['encoded by']],
            fast_run=fast_run,
            fast_run_base_filter=base_filter,
            fast_run_use_refs=True,
            ref_handler=update_retrieved_if_new,
            global_ref_mode="CUSTOM",
            core_props=core_props)
        wd_item_protein.set_label(self.label)
        wd_item_protein.set_description(self.description, lang='en')

        # remove the alias "protein"
        aliases = set(wd_item_protein.get_aliases()) | set(self.aliases)
        aliases.discard("protein")
        wd_item_protein.set_aliases(aliases, append=False)

        self.status = wdi_helpers.try_write(
            wd_item_protein, self.external_ids['UniProt ID'], PROPS['UniProt ID'],
            self.login, write=write)
        self.protein_wdid = wd_item_protein.wd_item_id
        return wd_item_protein
    except Exception as e:
        traceback.print_exception(*sys.exc_info())
        msg = wdi_helpers.format_msg(self.external_ids['Entrez Gene ID'],
                                     PROPS['Entrez Gene ID'], None, str(e),
                                     msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
        self.status = msg
        return None
def main(retrieved, fast_run, write):
    """Load the current WikiPathways RDF dump and process each selected pathway.

    The directory listing at data.wikipathways.org is scanned for links, the
    archives whose URL contains "rdf-wp" are downloaded and parsed into one
    graph, the pathway ids matched by the SPARQL query below are collected
    and ``run_one`` is called for each of them.  A pathway that raises is
    printed, logged as ERROR and skipped.

    :param retrieved: forwarded to ``run_one``
    :param fast_run: forwarded to ``run_one``
    :param write: forwarded to ``run_one``
    """
    login = wdi_login.WDLogin(WDUSER, WDPASS)
    temp = Graph()

    url = 'http://data.wikipathways.org/current/rdf'
    page = requests.get(url).text

    # Collect each linked file once, as an absolute URL.  The duplicate test
    # is made on the absolute URL because that is what the list holds.
    files = []
    for link in BeautifulSoup(page, "lxml", parse_only=SoupStrainer('a')):
        address = str(link).split("\"")
        if len(address) <= 1:
            continue
        filename = address[1].replace("./", "/")
        if len(filename) <= 1:
            # also drops the "./" self link, which has become "/" by now
            continue
        file_url = url + filename
        if file_url not in files:
            files.append(file_url)

    for file_url in files:
        if "rdf-wp" in file_url:  # get the most accurate file
            print(file_url)
            u = requests.get(file_url)
            with closing(u), zipfile.ZipFile(io.BytesIO(u.content)) as archive:
                for member in archive.infolist():
                    nt_content = archive.read(member)
                    temp.parse(data=nt_content.decode(), format="turtle")
            print("size: " + str(len(temp)))

    wp_query = """prefix dcterm: <http://purl.org/dc/terms/> prefix wp: <http://vocabularies.wikipathways.org/wp#> SELECT DISTINCT ?wpid WHERE { ?s rdf:type <http://vocabularies.wikipathways.org/wp#Pathway> ; dcterm:identifier ?wpid ; ?p <http://vocabularies.wikipathways.org/wp#Curation:AnalysisCollection> ; wp:organism <http://purl.obolibrary.org/obo/NCBITaxon_9606> . }"""

    wpids = []
    qres = temp.query(wp_query)
    for row in qres:
        print("%s" % row)
        wpids.append(str(row[0]))

    for pathway_id in wpids:
        try:
            run_one(pathway_id, retrieved, fast_run, write, login, temp)
        except Exception as e:
            # One broken pathway must not stop the remaining ones.
            traceback.print_exc()
            wdi_core.WDItemEngine.log(
                "ERROR",
                wdi_helpers.format_msg(pathway_id, PROPS['Wikipathways ID'], None,
                                       str(e), type(e)))
def validate_docs(docs, doc_type, external_id_prop):
    """Yield the validated form of each document, skipping invalid ones.

    A document whose validator raises ``ValueError`` is printed, logged as
    WARNING and left out of the output.

    :param docs: iterable of documents (each has an '_id' key)
    :param doc_type: 'eukaryotic' or 'microbial'; selects the validator
    :param external_id_prop: property id used in the log message
    """
    assert doc_type in {'eukaryotic', 'microbial'}
    validate = validate_doc_microbial if doc_type == "microbial" else validate_doc_eukaryotic

    for doc in docs:
        try:
            validated = validate(doc)
        except ValueError as e:
            print(e)
            wdi_core.WDItemEngine.log(
                "WARNING",
                wdi_helpers.format_msg(doc['_id'], external_id_prop, None, str(e), type(e)))
            continue
        yield validated
def run(self, records, total=None, fast_run=True, write=True):
    """Create the protein item for every record whose gene item is known.

    :param records: iterable of records; ``record['entrezgene']['@value']``
        is used to find the gene item
    :param total: forwarded to ``tqdm`` for the progress display
    :param fast_run: forwarded to ``Protein.create_item``
    :param write: forwarded to the item-writing calls
    """
    for record in tqdm(records, mininterval=2, total=total):
        entrez_gene = str(record['entrezgene']['@value'])

        if entrez_gene not in self.gene_wdid_mapping:
            # Without a gene item there is nothing to attach the protein to.
            warning = format_msg(entrez_gene, "P351", None,
                                 "Gene item not found during protein creation", None)
            wdi_core.WDItemEngine.log("WARNING", warning)
            continue

        gene_wdid = self.gene_wdid_mapping[entrez_gene]
        protein = Protein(record, self.organism_info, gene_wdid, self.login)
        protein.create_item(fast_run=fast_run, write=write)
        protein.make_gene_encodes(write=write)
def validate_docs(docs, doc_type, external_id_prop):
    """Yield the validated form of each document, skipping invalid ones.

    A document whose validator raises ``AssertionError`` has its traceback
    printed, is logged as WARNING and is left out of the output.

    :param docs: iterable of documents (each has an '_id' key)
    :param doc_type: 'gene', 'protein' or 'microbial'; selects the validator
    :param external_id_prop: property id used in the log message
    """
    assert doc_type in {'gene', 'protein', 'microbial'}
    if doc_type == "microbial":
        validate = validate_doc_microbial
    else:
        validate = functools.partial(validate_doc, doc_type=doc_type)

    for doc in docs:
        try:
            validated = validate(doc)
        except AssertionError as e:
            traceback.print_exception(*sys.exc_info())
            wdi_core.WDItemEngine.log(
                "WARNING",
                wdi_helpers.format_msg(doc['_id'], external_id_prop, None, str(e), type(e)))
            continue
        yield validated
def run(self, records, total=None, fast_run=True, write=True):
    """Create the gene item for every record that passes ``self.filter``.

    A record whose item creation raises is printed and logged as ERROR, and
    the message is stored as the gene's status.  Every gene whose status is
    not ``True`` afterwards has its ``entrez`` value added to ``self.failed``.

    :param records: iterable of records
    :param total: forwarded to ``tqdm`` for the progress display
    :param fast_run: forwarded to ``create_item``
    :param write: forwarded to ``create_item``
    """
    for record in tqdm(self.filter(records), mininterval=2, total=total):
        gene = self.GENE_CLASS(record, self.organism_info, self.chr_num_wdid, self.login)
        try:
            gene.create_item(fast_run=fast_run, write=write)
        except Exception as e:
            traceback.print_exception(*sys.exc_info())
            msg = wdi_helpers.format_msg(gene.external_ids['Entrez Gene ID'],
                                         PROPS['Entrez Gene ID'], None, str(e),
                                         msg_type=type(e))
            wdi_core.WDItemEngine.log("ERROR", msg)
            gene.status = msg

        if gene.status is not True:
            self.failed.append(gene.entrez)
def create_item(self, login=None, write=True):
    """Build the item of this node from its independent statements and write it.

    Uses the xref statements and the main statements that do not depend on
    other items.  The English label is set only when the item has none; the
    description is replaced only when it is empty or equals the default label.

    :param login: login object handed to ``wdi_helpers.try_write``
    :param write: forwarded to ``wdi_helpers.try_write``
    :return: the ``WDItemEngine`` on success; ``None`` for a deprecated node
        or after an exception was caught and logged
    """
    if self.deprecated:
        return None

    try:
        s = []
        s.extend(self.create_xref_statements())
        s.extend(self.create_main_statements_nodepend())

        wd_item = wdi_core.WDItemEngine(
            item_name=self.lbl,
            data=s,
            domain=self.domain,
            append_value=[PROPS['subclass of'], PROPS['instance of']],
            fast_run=self.fast_run,
            fast_run_base_filter={self.primary_ext_prop_qid: ''})

        if wd_item.get_label(lang="en") == "":
            wd_item.set_label(self.lbl, lang="en")

        current_descr = wd_item.get_description(lang='en')
        replaceable = {"", self.default_label}
        if current_descr.lower() in replaceable and self.definition and len(self.definition) < 250:
            wd_item.set_description(description=self.definition, lang='en')
        elif current_descr.lower() == "":
            # No usable definition: fall back to the generic description.
            wd_item.set_description(description=self.default_label, lang='en')

        if self.synonyms is not None:
            wd_item.set_aliases(aliases=self.synonyms, lang='en', append=True)
        if self.wikilink is not None:
            wd_item.set_sitelink(site="enwiki", title=self.wikilink)

        wdi_helpers.try_write(wd_item, record_id=self.id_colon,
                              record_prop=self.primary_ext_prop_qid,
                              login=login, write=write)
        self.wd_item_id = wd_item.wd_item_id
        return wd_item
    except Exception as e:
        traceback.print_exception(*sys.exc_info())
        msg = wdi_helpers.format_msg(self.id_colon, self.primary_ext_prop_qid, None,
                                     str(e), msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
def write_item(self, wd_item):
    """Write one prepared item, unless writing is switched off.

    :param wd_item: dict with the keys 'item', 'record_id' and 'record_prop'
    :return: None; a failing write is printed and logged as ERROR
    """
    if not self.write:
        return

    try:
        wdi_helpers.try_write(
            wd_item['item'],
            record_id=wd_item['record_id'],
            record_prop=wd_item['record_prop'],
            edit_summary='edit genetic association',
            login=self.login,
            write=self.write)
    except Exception as e:
        print(e)
        failure = wdi_helpers.format_msg(wd_item['record_id'], wd_item['record_prop'],
                                         wd_item['item'].wd_item_id, str(e), type(e))
        wdi_core.WDItemEngine.log("ERROR", failure)
def get_or_create_article(self, pmid):
    """Return the item id of the article with this PubMed id.

    Known ids come straight from ``self.pmid_wdid_map``.  Otherwise the item
    is looked up or created through ``wdi_helpers.PubmedItem`` (or replaced
    by the dummy id 'Q1' when writing is off), the lookup is logged as INFO
    and the result is remembered in the map.

    :param pmid: PubMed id of the article
    :return: the item id
    """
    # check if exists in wikidata
    if pmid in self.pmid_wdid_map:
        return self.pmid_wdid_map[pmid]

    p = wdi_helpers.PubmedItem(pmid)
    wdid = p.get_or_create(self.login) if self.write else 'Q1'  # Dummy ID when not writing
    wdi_core.WDItemEngine.log(
        "INFO",
        wdi_helpers.format_msg(pmid, PROPS['PubMed ID'], wdid, "CREATE"))
    self.pmid_wdid_map[pmid] = wdid
    return wdid
def create(self, write=True):
    """Collect this term's statements and hand its item to ``try_write``.

    The English label is set only when the item has none; the description is
    replaced only when it is empty or one of the generic disease texts.

    :param write: forwarded to ``wdi_helpers.try_write``
    :return: the ``WDItemEngine`` on success; ``None`` for a deprecated term
        or after an exception was caught and logged
    """
    if self.deprecated:
        return None

    try:
        self.create_xref_statements()
        self.s.extend(self.s_xref)
        self.create_main_statements()
        self.s.extend(self.s_main)

        wd_item = wdi_core.WDItemEngine(
            item_name=self.lbl,
            data=self.s,
            domain="diseases",
            append_value=[PROPS['subclass of'], PROPS['instance of']],
            fast_run=self.do_graph.fast_run,
            fast_run_base_filter={'P699': ''})

        if wd_item.get_label(lang="en") == "":
            wd_item.set_label(self.lbl, lang="en")

        current_descr = wd_item.get_description(lang='en')
        generic = {"", "human disease", "disease"}
        if current_descr.lower() in generic and self.definition and len(self.definition) < 250:
            wd_item.set_description(description=self.definition, lang='en')
        elif current_descr.lower() == "":
            wd_item.set_description(description="human disease", lang='en')

        if self.synonyms is not None:
            wd_item.set_aliases(aliases=self.synonyms, lang='en', append=True)
        if self.wikilink is not None:
            wd_item.set_sitelink(site="enwiki", title=self.wikilink)

        # The record is identified by its DOID, the same value the error path
        # below reports for property P699.
        wdi_helpers.try_write(wd_item, record_id=self.doid, record_prop='P699',
                              login=self.do_graph.login, write=write)
        return wd_item
    except Exception as e:
        exc_info = sys.exc_info()
        traceback.print_exception(*exc_info)
        msg = wdi_helpers.format_msg(self.doid, 'P699', None, str(e), msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
        return None