def get_mapper(self):
    """Return mapper between label and Wikidata item.

    Query the Wikidata Query Service with the static topic-label SPARQL
    query and convert the returned bindings to a dictionary.

    Returns
    -------
    mapper : dict
        Dictionary where the keys are the topic labels and the values are
        the associated Wikidata Q identifiers.

    Notes
    -----
    This method queries the Wikidata Query Service with a static SPARQL
    query. It will take some time to complete, perhaps 30 seconds or more.

    In some cases a timeout may occur in the middle of a response, making
    the JSON returned invalid. The method will then try a second time. If
    the second response cannot be decoded either, no exception is raised:
    the result is treated as empty and an empty mapper is returned.

    """
    for _attempt in range(2):
        response = requests.get(
            config.get('servers', 'SPARQLEndPointURL'),
            params={'query': TOPIC_LABELS_SPARQL, 'format': 'json'},
            headers=self.headers)
        try:
            response_data = response.json()
        except JSONDecodeError:
            # A timeout in the middle of a response leaves invalid JSON;
            # move on to the next attempt, if any.
            continue
        break
    else:
        # TODO: We may end here due to timeout or (perhaps?) invalid
        # JSON in the cache. It is unclear what we can do to escape
        # this problem other than wait. Here is made an empty response.
        response_data = {'results': {'bindings': []}}

    # The slice drops the first 31 characters of the item URI, which is
    # the length of 'http://www.wikidata.org/entity/'.
    return {
        datum['topic_label']['value']: datum['topic']['value'][31:]
        for datum in response_data['results']['bindings']
    }
def website_to_qs(url):
    """Look up Wikidata items from the URL of their official website.

    The items are found with the Wikidata Query Service by matching the
    URL against property P856.

    Parameters
    ----------
    url : str
        URL for official website. Leading and trailing whitespace is
        stripped before the URL is used in the query.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> url = ("https://papers.nips.cc/paper/"
    ...        "6498-online-and-differentially-private-tensor-decomposition")
    >>> qs = website_to_qs(url)
    >>> qs == ['Q46994097']
    True

    """
    # NOTE(review): the URL is placed between angle brackets as-is; no
    # escaping is applied to it.
    sparql = 'SELECT ?work WHERE {{ ?work wdt:P856 <{url}> }}'.format(
        url=url.strip())

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['work']['value'][31:] for binding in bindings]
def atomic_symbol_to_qs(symbol):
    """Look up a chemical element by atomic symbol and return a Wikidata ID.

    The Wikidata Query Service is queried for items whose P246 value is
    exactly the given symbol.

    Parameters
    ----------
    symbol : str
        Atomic symbol.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> atomic_symbol_to_qs('C') == ['Q623']
    True

    """
    # This query only matches on exact match. The symbol is escaped, as in
    # the other identifier lookups of this module, so that it cannot
    # terminate the string literal it is placed in.
    query = 'SELECT ?item WHERE {{ ?item wdt:P246 "{symbol}" }}'.format(
        symbol=escape_string(symbol))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['item']['value'][31:] for binding in bindings]
def atomic_number_to_qs(atomic_number):
    """Look up a chemical element by atomic number and return a Wikidata ID.

    The Wikidata Query Service is queried for instances (P31) of Q11344
    whose P1086 value, converted to a string, equals the given number.

    Parameters
    ----------
    atomic_number : str
        Atomic number, given as a string.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> atomic_number_to_qs('6') == ['Q623']
    True

    """
    # This query only matches on exact match. The number is escaped, as in
    # the other identifier lookups of this module, so that it cannot
    # terminate the string literal it is compared against.
    query = (
        'SELECT ?item '
        'WHERE {{ ?item wdt:P31 wd:Q11344 ; wdt:P1086 ?number . '
        'FILTER (STR(?number) = "{atomic_number}") }}'
    ).format(atomic_number=escape_string(atomic_number))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['item']['value'][31:] for binding in bindings]
def lipidmaps_to_qs(lmid):
    """Convert a LIPID MAPS identifier to Wikidata ID.

    The Wikidata Query Service is queried for items whose P2063 value is
    exactly the given identifier.

    Parameters
    ----------
    lmid : str
        LIPID MAPS identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> lipidmaps_to_qs('LMFA') == ['Q63433687']
    True
    >>> lipidmaps_to_qs('LMFA00000007') == ['Q27114894']
    True

    """
    # This query only matches on exact match. The identifier is escaped,
    # as in the other identifier lookups of this module, so that it cannot
    # terminate the string literal it is placed in.
    query = 'select ?item where {{ ?item wdt:P2063 "{lmid}" }}'.format(
        lmid=escape_string(lmid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['item']['value'][31:] for binding in bindings]
def arxiv_to_qs(arxiv):
    """Look up Wikidata items by arXiv identifier.

    The Wikidata Query Service is queried for items whose P818 value is
    the given identifier.

    Parameters
    ----------
    arxiv : str
        ArXiv identifier.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> arxiv_to_qs('1507.04180') == ['Q27036443']
    True

    """
    sparql = 'select ?work where {{ ?work wdt:P818 "{arxiv}" }}'.format(
        arxiv=escape_string(arxiv))

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['work']['value'][31:] for binding in bindings]
def issn_to_qs(issn):
    """Look up Wikidata items by ISSN.

    The Wikidata Query Service is queried for items whose P236 value is
    the given ISSN.

    Parameters
    ----------
    issn : str
        ISSN identifier as a string.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> issn_to_qs('1533-7928') == ['Q1660383']
    True

    """
    sparql = 'select ?author where {{ ?author wdt:P236 "{issn}" }}'.format(
        issn=escape_string(issn))

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # The query variable is called ?author although it holds the matched
    # item. Drop the first 31 characters of each URI to keep the identifier.
    return [binding['author']['value'][31:] for binding in bindings]
def inchikey_to_qs(inchikey):
    """Look up Wikidata items by InChIKey.

    The Wikidata Query Service is queried for items whose P235 value is
    exactly the given InChIKey.

    Parameters
    ----------
    inchikey : str
        inchikey identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> inchikey_to_qs('UHOVQNZJYSORNB-UHFFFAOYSA-N') == ['Q2270']
    True

    """
    # Only exact matches are returned by this query.
    sparql = 'select ?item where {{ ?item wdt:P235 "{inchikey}" }}'.format(
        inchikey=escape_string(inchikey))

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['item']['value'][31:] for binding in bindings]
def twitter_to_qs(twitter):
    """Look up Wikidata items by Twitter account name.

    The Wikidata Query Service is queried for items whose P2002 value is
    exactly the given account name.

    Parameters
    ----------
    twitter : str
        Twitter account identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> twitter_to_qs('utafrith') == ['Q8219']
    True

    """
    # Only exact matches are returned by this query.
    sparql = 'select ?item where {{ ?item wdt:P2002 "{twitter}" }}'.format(
        twitter=escape_string(twitter))

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['item']['value'][31:] for binding in bindings]
def cordis_to_qs(cordis):
    """Look up Wikidata items by CORDIS project ID.

    The Wikidata Query Service is queried for items whose P3400 value is
    exactly the given identifier.

    Parameters
    ----------
    cordis : str
        CORDIS identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> cordis_to_qs('604134') == ['Q27990087']
    True

    """
    # Only exact matches are returned by this query.
    sparql = 'select ?item where {{ ?item wdt:P3400 "{cordis}" }}'.format(
        cordis=escape_string(cordis))

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['item']['value'][31:] for binding in bindings]
def viaf_to_qs(viaf):
    """Look up Wikidata items by VIAF identifier.

    The Wikidata Query Service is queried for items whose P214 value is
    the given identifier.

    Parameters
    ----------
    viaf : str
        VIAF identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> viaf_to_qs('59976288') == ['Q3259614']
    True

    """
    sparql = 'select ?author where {{ ?author wdt:P214 "{viaf}" }}'.format(
        viaf=escape_string(viaf))

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['author']['value'][31:] for binding in bindings]
def mesh_to_qs(meshid):
    """Convert MeSH ID to Wikidata ID.

    The Wikidata Query Service is queried for items whose P486 value is
    the given identifier.

    Parameters
    ----------
    meshid : str
        MeSH identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> mesh_to_qs('D028441') == ['Q33659470']
    True

    """
    # The identifier is escaped, as in the other identifier lookups of
    # this module, so that it cannot terminate the string literal it is
    # placed in.
    query = 'select ?cmp where {{ ?cmp wdt:P486 "{meshid}" }}'.format(
        meshid=escape_string(meshid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['cmp']['value'][31:] for binding in bindings]
def cas_to_qs(cas):
    """Convert a CAS registry number to Wikidata ID.

    The Wikidata Query Service is queried for items whose P231 value is
    exactly the given registry number.

    Parameters
    ----------
    cas : str
        CAS registry number

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> cas_to_qs('50-00-0') == ['Q161210']
    True

    """
    # This query only matches on exact match. The number is escaped, as in
    # the other identifier lookups of this module, so that it cannot
    # terminate the string literal it is placed in.
    query = 'select ?item where {{ ?item wdt:P231 "{cas}" }}'.format(
        cas=escape_string(cas))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['item']['value'][31:] for binding in bindings]
def orcid_to_qs(orcid):
    """Look up Wikidata items by ORCID identifier.

    The Wikidata Query Service is queried for items whose P496 value is
    the given identifier.

    Parameters
    ----------
    orcid : str
        ORCID identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> orcid_to_qs('0000-0001-6128-3356') == ['Q20980928']
    True

    """
    sparql = 'select ?author where {{ ?author wdt:P496 "{orcid}" }}'.format(
        orcid=escape_string(orcid))

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['author']['value'][31:] for binding in bindings]
def ror_to_qs(rorid):
    """Convert a ROR identifier to Wikidata ID.

    Wikidata Query Service is used to resolve the ROR identifier, by
    matching it against the P6782 value of the items.

    Parameters
    ----------
    rorid : str
        ROR identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> ror_to_qs('038321296') == ['Q5566337']
    True

    """
    # The identifier is escaped, as in the other identifier lookups of
    # this module, so that it cannot terminate the string literal it is
    # placed in.
    query = 'select ?work where {{ ?work wdt:P6782 "{rorid}" }}'.format(
        rorid=escape_string(rorid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['work']['value'][31:] for binding in bindings]
def wikipathways_to_qs(wpid):
    """Convert a WikiPathways identifier to Wikidata ID.

    Wikidata Query Service is used to resolve the WikiPathways identifier,
    by matching it against the P2410 value of the items.

    Parameters
    ----------
    wpid : str
        WikiPathways identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> wikipathways_to_qs('WP111') == ['Q28031254']
    True

    """
    # The identifier is escaped, as in the other identifier lookups of
    # this module, so that it cannot terminate the string literal it is
    # placed in.
    query = ('select ?work where {{ VALUES ?wpid {{ "{wpid}" }} '
             '?work wdt:P2410 ?wpid }}').format(wpid=escape_string(wpid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['work']['value'][31:] for binding in bindings]
def pubmed_to_qs(pmid):
    """Convert a PubMed identifier to Wikidata ID.

    Wikidata Query Service is used to resolve the PubMed identifier, by
    matching it against the P698 value of the items. The identifier is
    used as given; no case conversion is applied.

    Parameters
    ----------
    pmid : str
        PubMed identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> pubmed_to_qs('29029422') == ['Q42371516']
    True

    """
    # The identifier is escaped, as in the other identifier lookups of
    # this module, so that it cannot terminate the string literal it is
    # placed in.
    query = 'select ?work where {{ ?work wdt:P698 "{pmid}" }}'.format(
        pmid=escape_string(pmid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['work']['value'][31:] for binding in bindings]
def pubchem_to_qs(cid):
    """Convert a PubChem compound identifier (CID) to Wikidata ID.

    Wikidata Query Service is used to resolve the PubChem identifier, by
    matching it against the P662 value of the items.

    Parameters
    ----------
    cid : str
        PubChem compound identifier (CID)

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> pubchem_to_qs('14123361') == ['Q289372']
    True

    """
    # The identifier is escaped, as in the other identifier lookups of
    # this module, so that it cannot terminate the string literal it is
    # placed in.
    query = 'select ?chemical where {{ ?chemical wdt:P662 "{cid}" }}'.format(
        cid=escape_string(cid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['chemical']['value'][31:] for binding in bindings]
def github_to_qs(github):
    """Look up Wikidata items by GitHub account name.

    The Wikidata Query Service is queried for items whose P2037 value is
    exactly the given account name.

    Parameters
    ----------
    github : str
        github account identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> github_to_qs('vrandezo') == ['Q18618629']
    True

    """
    # Only exact matches are returned by this query.
    sparql = 'select ?item where {{ ?item wdt:P2037 "{github}" }}'.format(
        github=escape_string(github))

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['item']['value'][31:] for binding in bindings]
def q_to_label(q, language='en'):
    """Return the label of a Wikidata item in a given language.

    The Wikidata Query Service is queried for the `rdfs:label` values of
    the item, filtered on the language.

    Parameters
    ----------
    q : str
        String with Wikidata Q item.
    language : str
        String with language identifier

    Returns
    -------
    label : str or None
        String with label corresponding to Wikidata item. None is returned
        unless the query yields exactly one label.

    Examples
    --------
    >>> q_to_label('Q80') == "Tim Berners-Lee"
    True

    """
    sparql = (
        'SELECT ?label '
        'WHERE {{ wd:{q} rdfs:label ?label . '
        'FILTER (LANG(?label) = "{language}") }}'
    ).format(q=q, language=language)

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Zero labels as well as several labels count as "no label".
    if len(bindings) != 1:
        return None
    return bindings[0]['label']['value']
def q_to_bibliography_templates(q):
    """Construct bibliography for Wikidata based on Wikidata identifier.

    The bibliography SPARQL query is formatted with the identifier and sent
    to the Wikidata Query Service. Each returned row is rendered with the
    news citation template when its type ends with Q5707594 or Q17928402
    and with the journal citation template otherwise.

    Parameters
    ----------
    q : str
        String with Wikidata item identifier.

    Returns
    -------
    wikitext : str
        String with wikipedia template formatted bibliography. It starts
        with an HTML comment telling how the text was generated.

    References
    ----------
    https://en.wikipedia.org/wiki/Template:Cite_journal

    Examples
    --------
    >>> wikitext = q_to_bibliography_templates("Q28923929")
    >>> wikitext.find('Cite journal') != -1
    True

    """
    query = BIBLIOGRAPHY_SPARQL_QUERY.format(q=q)
    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    # Identify as Scholia with the shared headers, like every other query
    # in this module, instead of the default User-Agent of requests.
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    # Collect the pieces and join them once at the end.
    parts = [
        ('<!-- Generated with scholia.wikipedia '
         'q-to-bibliography-templates {q}\n').format(q=q),
        ('  or http://tools.wmflabs.org/scholia/'
         'q-to-bibliography-templates?q={q} -->\n').format(q=q),
    ]

    for item in data['results']['bindings']:
        if _value(item, 'type').endswith(('Q5707594', 'Q17928402')):
            # news article or blog post
            parts.append(CITE_NEWS_TEMPLATE.format(
                title=_value(item, 'title'),
                work=_value(item, 'venueLabel'),
                # Keep only the part of the timestamp before the 'T'.
                date=_value(item, 'date').split('T')[0],
                url=_value(item, 'url'),
            ))
        else:
            parts.append(CITE_JOURNAL_TEMPLATE.format(
                title=_value(item, 'title'),
                journal=_value(item, 'venueLabel'),
                volume=_value(item, 'volume'),
                issue=_value(item, 'issue'),
                date=_value(item, 'date').split('T')[0],
                pages=_value(item, 'pages'),
                license=_value(item, 'license'),
                doi=_value(item, 'doi'),
                url=_value(item, 'url'),
            ))

    return ''.join(parts)
def iso639_to_q(language):
    """Convert ISO639 to Q item.

    The code is first looked up in the hard-coded `ISO639_TO_Q` mapping.
    Codes not found there are resolved with the Wikidata Query Service:
    two-character codes via P218 and three-character codes via P219.

    Parameters
    ----------
    language : str
        language represented as a ISO 639 format

    Returns
    -------
    q : str or None
        Language represented as a q identifier. None is returned if the
        query matches no item.

    Raises
    ------
    ValueError
        If the code is not in the mapping and is neither two nor three
        characters long.
    QueryResultError
        If the query matches more than one item.

    Examples
    --------
    >>> iso639_to_q('en') == 'Q1860'
    True
    >>> iso639_to_q('dan') == 'Q9035'
    True

    """
    if language in ISO639_TO_Q:
        return ISO639_TO_Q[language]

    # Fallback on query: the code length selects the property to match.
    property_by_length = {2: 'P218', 3: 'P219'}
    code_length = len(language)
    if code_length not in property_by_length:
        raise ValueError('ISO639 language code not recognized')
    # NOTE(review): the code is placed in a single-quoted literal as-is;
    # no escaping is applied to it.
    sparql = "SELECT * {{ ?language wdt:{prop} '{code}' }}".format(
        prop=property_by_length[code_length], code=language)

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']
    # Drop the first 31 characters of each item URI to keep the identifier.
    matches = [binding['language']['value'][31:] for binding in bindings]

    if len(matches) > 1:
        # There shouldn't be multiple matching items, so it is not clear
        # what we can do here.
        raise QueryResultError("Multiple matching language found for "
                               "ISO639 code")
    return matches[0] if matches else None
def count_scientific_articles():
    """Return count for the number of scientific articles.

    The Wikidata Query Service is asked to count the items that are
    instances (P31) of Q13442814.

    Returns
    -------
    count : int
        Number of scientific articles in Wikidata.

    """
    sparql = ' SELECT (COUNT(*) AS ?count) WHERE { [] wdt:P31 wd:Q13442814 }'

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)

    # The aggregate comes back as a single row holding the count as text.
    first_row = response.json()['results']['bindings'][0]
    return int(first_row['count']['value'])
def query_to_bindings(query):
    """Run a SPARQL query and return the bindings of the response.

    The query is sent to the Wikidata Query Service as it is, and the
    `results.bindings` part of the JSON response is returned.

    Parameters
    ----------
    query : str
        SPARQL query as string

    Returns
    -------
    bindings : list
        Data as list of dicts.

    """
    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': query, 'format': 'json'},
        headers=HEADERS)
    return response.json()['results']['bindings']
def search(query, limit=10):
    """Search Wikidata.

    A full-text search request is sent to the web service configured as
    `webservice_url`.

    Parameters
    ----------
    query : str
        Query string.
    limit : int, optional
        Number of maximum search results to return.

    Returns
    -------
    result : list of dicts
        One dict per hit, where 'q' holds the title of the hit and
        'description' holds its snippet.

    """
    # Parameters for the search request to the Wikidata API
    api_params = {
        'action': 'query',
        'list': 'search',
        'srlimit': limit,
        'srsearch': query,
        'srwhat': 'text',
        'format': 'json',
    }
    response = requests.get(config.get('servers', 'webservice_url'),
                            params=api_params, headers=HEADERS)

    # Convert the hits of the response
    hits = response.json()['query']['search']
    return [
        {'q': hit['title'], 'description': hit['snippet']}
        for hit in hits
    ]
def doi_to_qs(doi):
    """Look up Wikidata items by DOI.

    Wikidata Query Service is used to resolve the DOI, by matching it
    against the P356 value of the items. The DOI string is converted to
    uppercase before any query is made. Uppercase DOIs are default in
    Wikidata.

    Parameters
    ----------
    doi : str
        DOI identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> doi_to_qs('10.1186/S13321-016-0161-3') == ['Q26899110']
    True

    >>> doi_to_qs('10.1016/j.stem.2016.02.016') == ['Q23008981']
    True

    """
    uppercased_doi = doi.upper()
    sparql = 'select ?work where {{ ?work wdt:P356 "{doi}" }}'.format(
        doi=escape_string(uppercased_doi))

    response = requests.get(
        config.get('servers', 'SPARQLEndPointURL'),
        params={'query': sparql, 'format': 'json'},
        headers=HEADERS)
    bindings = response.json()['results']['bindings']

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [binding['work']['value'][31:] for binding in bindings]
def q_to_class(q):
    """Return Scholia class of Wikidata item.

    The 'class', i.e., which kind of instance the item is, is determined by
    querying the Wikidata Query Service.

    Parameters
    ----------
    q : str
        Wikidata item identifier.

    Returns
    -------
    class_ : 'author', 'venue', 'organization', ...
        Scholia class represented as a string. 'topic' is returned when
        nothing else matches.

    Notes
    -----
    The Wikidata Query Service will be queried for the P31 values of the
    item. The values are compared against a set of hardcoded matches, and
    the first matching branch decides the class, so the order of the
    branches matters for items with several P31 values.

    If the response to the P31 query is not valid JSON, the item is treated
    as having no P31 values.

    If no branch matches, a second query fetches the transitive P279
    parents of the item: 'chemical_class' is returned if chemical compound
    or chemical substance is among them, and 'topic' otherwise.

    """
    query = 'SELECT ?class {{ wd:{q} wdt:P31 ?class }}'.format(
        q=escape_string(q))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    try:
        data = response.json()
    except JSONDecodeError:
        # If the query service does not return a proper JSON response,
        # then fall back on nothing.
        classes = []
    else:
        # Drop the first 31 characters of each class URI to keep the
        # identifier.
        classes = [
            item['class']['value'][31:]
            for item in data['results']['bindings']
        ]

    # Hard-coded matching; the first matching branch wins.
    if ('Q5' in classes):  # human
        class_ = 'author'
    elif ('Q30612' in classes):  # clinical trial
        class_ = 'clinical_trial'
    elif set(classes).intersection([
            'Q277759',  # book series
            'Q2217301',  # serial (publication series)
            'Q27785883',  # conference proceedings series
            ]):
        class_ = 'series'
    elif set(classes).intersection([
            'Q737498',  # academic journal
            'Q5633421',  # scientific journal
            'Q1143604',  # proceedings
            ]):
        class_ = 'venue'
    elif ('Q157031' in classes or  # foundation
          'Q10498148' in classes):  # research council
        class_ = 'sponsor'
    elif ('Q2085381' in classes or  # publisher
          'Q479716' in classes):  # university publisher
        class_ = 'publisher'
    elif set(classes).intersection([
            'Q8054',  # protein
            ]):
        class_ = 'protein'
    elif set(classes).intersection([
            'Q170584',  # project
            'Q1298668',  # research project
            ]):
        class_ = 'project'
    elif set(classes).intersection([
            'Q7187',  # gene
            ]):
        class_ = 'gene'
    elif set(classes).intersection([
            'Q571',  # book
            'Q191067',  # article
            'Q253623',  # patent
            'Q580922',  # preprint
            'Q1980247',  # chapter
            'Q3331189',  # edition
            'Q5707594',  # news article
            'Q10870555',  # report
            'Q10885494',  # scientific conference paper
            'Q13442814',  # scientific article
            'Q21481766',  # academic chapter
            'Q47461344',  # written work
            'Q54670950',  # conference poster
            'Q58632367',  # conference abstract
            ]):
        class_ = 'work'
    elif set(classes).intersection([
            'Q7191',  # Nobel prize
            'Q193622',  # order
            'Q230788',  # grant
            'Q378427',  # literary award
            'Q618779',  # award
            'Q1364556',  # music award
            'Q1407225',  # television award
            'Q1709894',  # journalism award
            'Q1792571',  # art prize
            'Q1829324',  # architecture award
            'Q4220917',  # film award
            'Q11448906',  # science prize
            'Q15383322',  # culture award
            ]):
        class_ = 'award'
    elif set(classes).intersection([
            'Q3918',  # university
            'Q31855',  # research institute
            'Q38723',  # higher education institution
            'Q414147',  # academy of sciences
            'Q484652',  # international organization
            'Q748019',  # scientific society
            'Q875538',  # public university
            'Q902104',  # private university
            'Q955824',  # learned society
            'Q1371037',  # technical university
            'Q2467461',  # university department
            'Q3354859',  # collegiate university
            'Q4358176',  # council
            'Q7315155',  # research center
            'Q15936437',  # research university
            'Q23002054',  # "private not-for-profit educational"
            'Q29300714',  # international association
            ]):
        class_ = 'organization'
    elif set(classes).intersection([
            'Q15275719',  # recurrent event
            'Q15900647',  # conference series
            'Q47258130',  # scientific conference series
            'Q47459256',  # academic workshop series
            ]):
        class_ = 'event_series'
    elif set(classes).intersection([
            'Q1656682',  # event
            'Q27968055',  # recurrent event edition (event in a series)
            'Q52260246',  # scientific event
            ]):
        class_ = 'event'
    elif set(classes).intersection([
            'Q12136',  # disease
            'Q389735',  # cardiovascular system disease
            'Q18965518',  # artery disease
            ]):
        class_ = 'disease'
    elif set(classes).intersection([
            'Q11173',  # chemical compound
            'Q36496',  # ion
            'Q79529',  # chemical substance
            'Q407595',  # metabolite
            'Q2393187',  # molecular entity
            ]):
        class_ = 'chemical'
    elif set(classes).intersection([
            'Q11344',  # chemical element
            ]):
        class_ = 'chemical_element'
    elif set(classes).intersection([
            'Q15711994',  # family of isomeric compounds
            'Q17339814',  # group or class of chemical substances
            'Q47154513',  # structural class of chemical compounds
            'Q55499636',  # pharmacological class of chemical compounds
            'Q55640599',  # group of ions
            'Q55662456',  # group of ortho, meta, para isomers
            'Q55662548',  # pair of cis-trans isomers
            'Q55662747',  # pair of enantiomers
            'Q55663030',  # pair of enantiomeric ions
            'Q56256086',  # group of chemical compounds
            'Q56256173',  # class of chemical compounds with similar
                          # applications or functions
            'Q59199015',  # group of stereoisomers
            ]):
        class_ = 'chemical_class'
    elif set(classes).intersection([
            'Q4915012',  # biological pathway
            ]):
        class_ = 'pathway'
    elif set(classes).intersection([
            'Q16521',  # taxon
            ]):
        class_ = 'taxon'
    # Second group of identifiers that also map to 'event'; it is only
    # reached when none of the branches above matched.
    elif set(classes).intersection([
            'Q46855',  # hackathon
            'Q625994',  # conference
            'Q2020153',  # scientific conference
            'Q40444998',  # academic workshop
            ]):
        class_ = 'event'
    elif set(classes).intersection([
            'Q7397',  # software
            'Q1172284',  # dataset
            'Q1639024',  # mathematical software
            'Q21127166',  # Java software library
            'Q21129801',  # natural language processing toolkit
            'Q22811662',  # image database
            'Q24529812',  # statistical package
            ]):
        class_ = 'use'
    else:
        # No P31 value matched: look at the transitive subclass-of (P279)
        # parents of the item instead.
        query = 'select ?class where {{ wd:{q} wdt:P279+ ?class }}'.format(
            q=escape_string(q))

        url = config.get('servers', 'SPARQLEndPointURL')
        params = {'query': query, 'format': 'json'}
        response = requests.get(url, params=params, headers=HEADERS)
        # NOTE(review): unlike the first request, a JSON decoding failure
        # is not caught here and will propagate to the caller.
        data = response.json()
        parents = [
            item['class']['value'][31:]
            for item in data['results']['bindings']
        ]
        if set(parents).intersection([
                'Q11173',  # chemical compound
                'Q79529',  # chemical substance
                ]):
            class_ = 'chemical_class'
        else:
            class_ = 'topic'

    return class_
USER_AGENT = 'Scholia' HEADERS = {'User-Agent': USER_AGENT} PAPER_TO_Q_QUERY = u(""" SELECT ?paper WHERE {{ OPTIONAL {{ ?label rdfs:label "{label}"@en . }} OPTIONAL {{ ?title wdt:P1476 "{title}"@en . }} OPTIONAL {{ ?url wdt:P953 <{url}> . }} BIND(COALESCE(?full_text_url, ?url, ?label, ?title) AS ?paper) }} """) # SPARQL Endpoint for Wikidata Query Service WDQS_URL = config.get('servers', 'SPARQLEndPointURL') def paper_to_q(paper): """Find Q identifier for paper. Parameters ---------- paper : dict Paper represented as dictionary. Returns ------- q : str or None Q identifier in Wikidata. None is returned if the paper is not found.
def search_article_titles(q, search_string=None):
    """Search articles with q item.

    Scientific articles whose title contains the search string, and which
    do not already have the item (or one of its subclasses) as main
    subject, are returned.

    Parameters
    ----------
    q : str
        String with Wikidata Q item.
    search_string : str, optional
        String with query string. If it is not provided then the label of
        the q item is used as the query string.

    Returns
    -------
    results : list of dict
        List of dicts with query result. Each dict has the title of the
        article under 'title' and its Wikidata identifier under 'q'. An
        empty list is returned if no search string is given and no label
        is found for the item.

    Notes
    -----
    This function uses the Egon Willighagen trick with iterating over
    batches of 500,000 articles and performing a search in the
    (scientific) article title for the query string via the `CONTAINS`
    SPARQL function. Case is ignored.

    """
    if search_string is None:
        search_string = q_to_label(q)
    if search_string is None:
        # No label could be found for the item, so there is nothing to
        # search for.
        return []

    # The title is lowercased in the query, so the search string is
    # lowercased too. It is escaped so that a quote in a label cannot
    # terminate the string literal it is placed in.
    label = escape_string(search_string.lower())

    query_template = (
        ' SELECT ?article ?title '
        'WITH {{ '
        'SELECT ?article WHERE {{ ?article wdt:P31 wd:Q13442814 }} '
        'LIMIT {batch_size} OFFSET {offset} '
        '}} AS %results '
        'WHERE {{ '
        'INCLUDE %results '
        '?article wdt:P1476 ?title . '
        'MINUS {{ ?article wdt:P921 / wdt:P279* wd:{q} }} '
        'FILTER (CONTAINS(LCASE(?title), "{label}")) '
        '}}')

    # Number of articles and a bit more to account for possible
    # addition during query.
    article_count = count_scientific_articles() + 1000

    url = config.get('servers', 'SPARQLEndPointURL')

    batch_size = 500000
    loops = article_count // batch_size + 1

    results = []
    for loop in range(loops):
        query = query_template.format(
            batch_size=batch_size, offset=loop * batch_size,
            label=label, q=q)
        params = {'query': query, 'format': 'json'}
        response = requests.get(url, params=params, headers=HEADERS)
        data = response.json()

        # Drop the first 31 characters of each article URI to keep the
        # identifier.
        results.extend(
            {'title': item['title']['value'],
             'q': item['article']['value'][31:]}
            for item in data['results']['bindings'])

    return results