Beispiel #1
0
    def get_mapper(self):
        """Return mapper between label and Wikidata item.

        Query the Wikidata Query service to get Wikidata identifiers
        and associated labels and convert them to a dictionary.

        Returns
        -------
        mapper : dict
            Dictionary where the keys are labels and the values are the
            associated Wikidata Q identifiers.

        Notes
        -----
        This method queries the Wikidata Query Service with a static
        SPARQL query. It will take some time to complete, perhaps 30 seconds
        or more.

        In some cases a timeout may occur in the middle of a response,
        making the JSON returned invalid. The method will then try a second
        time. If the second response cannot be decoded either, no exception
        is raised: an empty mapper is returned.

        """
        endpoint = config.get('servers', 'SPARQLEndPointURL')
        params = {'query': TOPIC_LABELS_SPARQL, 'format': 'json'}

        # TODO: We may end with this empty response due to timeout or
        # (perhaps?) invalid JSON in the cache. It is unclear what we can do
        # to escape this problem other than wait.
        response_data = {'results': {'bindings': []}}

        # At most two attempts: the first request and a single retry.
        for _ in range(2):
            response = requests.get(endpoint, params=params,
                                    headers=self.headers)
            try:
                response_data = response.json()
            except JSONDecodeError:
                # In some cases a timeout may occur in the middle of a
                # response, making the JSON returned invalid.
                continue
            break

        # The first 31 characters of the topic URI are dropped so that only
        # the trailing identifier is kept as the value.
        return {
            datum['topic_label']['value']: datum['topic']['value'][31:]
            for datum in response_data['results']['bindings']
        }
Beispiel #2
0
def website_to_qs(url):
    """Look up Wikidata IDs from the URL of an official website.

    Parameters
    ----------
    url : str
        URL for official website. Leading and trailing whitespace is
        stripped before the URL is inserted in the query.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> url = ("https://papers.nips.cc/paper/"
    ...        "6498-online-and-differentially-private-tensor-decomposition")
    >>> qs = website_to_qs(url)
    >>> qs == ['Q46994097']
    True

    """
    sparql = 'SELECT ?work WHERE {{ ?work wdt:P856 <{url}> }}'.format(
        url=url.strip())

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint,
                         params={'query': sparql, 'format': 'json'},
                         headers=HEADERS)

    qs = []
    for binding in reply.json()['results']['bindings']:
        # Drop the first 31 characters of the returned URI (presumably the
        # entity URI prefix), leaving the identifier.
        qs.append(binding['work']['value'][31:])
    return qs
Beispiel #3
0
def atomic_symbol_to_qs(symbol):
    """Look up a chemical element by atomic symbol and return a Wikidata ID.

    The symbol is escaped with `escape_string` before it is inserted in the
    SPARQL string literal, so quotes or backslashes in the input cannot
    alter the query.

    Parameters
    ----------
    symbol : str
        Atomic symbol.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> atomic_symbol_to_qs('C') == ['Q623']
    True

    """
    # This query only matches on exact match
    query = """SELECT ?item
               WHERE {{ ?item wdt:P246 "{symbol}" }}""".format(
        symbol=escape_string(symbol))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [item['item']['value'][31:] for item in data['results']['bindings']]
Beispiel #4
0
def atomic_number_to_qs(atomic_number):
    """Look up a chemical element by atomic number and return a Wikidata ID.

    The atomic number is escaped with `escape_string` before it is inserted
    in the SPARQL string literal, so quotes or backslashes in the input
    cannot alter the query.

    Parameters
    ----------
    atomic_number : str
        Atomic number.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> atomic_number_to_qs('6') == ['Q623']
    True

    """
    # This query only matches on exact match: the P1086 value is compared
    # as a string with the given atomic number.
    query = """SELECT ?item
               WHERE {{ ?item wdt:P31 wd:Q11344 ; wdt:P1086 ?number .
                 FILTER (STR(?number) = "{atomic_number}") }}""".format(
        atomic_number=escape_string(atomic_number))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [item['item']['value'][31:] for item in data['results']['bindings']]
Beispiel #5
0
def lipidmaps_to_qs(lmid):
    """Convert a LIPID MAPS identifier to Wikidata ID.

    The identifier is escaped with `escape_string` before it is inserted in
    the SPARQL string literal, so quotes or backslashes in the input cannot
    alter the query.

    Parameters
    ----------
    lmid : str
        LIPID MAPS identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> lipidmaps_to_qs('LMFA') == ['Q63433687']
    True
    >>> lipidmaps_to_qs('LMFA00000007') == ['Q27114894']
    True

    """
    # This query only matches on exact match
    query = """select ?item
               where {{ ?item wdt:P2063 "{lmid}" }}""".format(
        lmid=escape_string(lmid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [item['item']['value'][31:] for item in data['results']['bindings']]
Beispiel #6
0
def arxiv_to_qs(arxiv):
    """Look up Wikidata IDs from an arXiv identifier.

    Parameters
    ----------
    arxiv : str
        ArXiv identifier. It is escaped before being put in the query.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> arxiv_to_qs('1507.04180') == ['Q27036443']
    True

    """
    sparql = 'select ?work where {{ ?work wdt:P818 "{arxiv}" }}'.format(
        arxiv=escape_string(arxiv))

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint,
                         params={'query': sparql, 'format': 'json'},
                         headers=HEADERS)
    bindings = reply.json()['results']['bindings']

    # Only the part after the first 31 characters of each URI is returned.
    qs = []
    for binding in bindings:
        qs.append(binding['work']['value'][31:])
    return qs
Beispiel #7
0
def issn_to_qs(issn):
    """Look up Wikidata IDs from an ISSN.

    Parameters
    ----------
    issn : str
        ISSN identifier as a string. It is escaped before being put in
        the query.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> issn_to_qs('1533-7928') == ['Q1660383']
    True

    """
    sparql = 'select ?author where {{ ?author wdt:P236 "{issn}" }}'.format(
        issn=escape_string(issn))

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint,
                         params={'query': sparql, 'format': 'json'},
                         headers=HEADERS)

    # The matched item is bound to the ?author variable in this query.
    qs = []
    for binding in reply.json()['results']['bindings']:
        uri = binding['author']['value']
        # Only the part after the first 31 characters of the URI is kept.
        qs.append(uri[31:])
    return qs
Beispiel #8
0
def inchikey_to_qs(inchikey):
    """Look up Wikidata IDs from an InChIKey.

    Parameters
    ----------
    inchikey : str
        InChIKey identifier. It is escaped before being put in the query.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> inchikey_to_qs('UHOVQNZJYSORNB-UHFFFAOYSA-N') == ['Q2270']
    True

    """
    # Exact matching only: the literal must equal the stored P235 value.
    sparql = """select ?item
               where {{ ?item wdt:P235 "{inchikey}" }}""".format(
        inchikey=escape_string(inchikey))

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint,
                         params={'query': sparql, 'format': 'json'},
                         headers=HEADERS)
    bindings = reply.json()['results']['bindings']

    # Only the part after the first 31 characters of each URI is returned.
    qs = []
    for binding in bindings:
        qs.append(binding['item']['value'][31:])
    return qs
Beispiel #9
0
def twitter_to_qs(twitter):
    """Look up Wikidata IDs from a Twitter account name.

    Parameters
    ----------
    twitter : str
        Twitter account identifier. It is escaped before being put in
        the query.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> twitter_to_qs('utafrith') == ['Q8219']
    True

    """
    # Exact matching only: the literal must equal the stored P2002 value.
    sparql = """select ?item
               where {{ ?item wdt:P2002 "{twitter}" }}""".format(
        twitter=escape_string(twitter))

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    request_params = {'query': sparql, 'format': 'json'}
    payload = requests.get(endpoint, params=request_params,
                           headers=HEADERS).json()

    qs = []
    for binding in payload['results']['bindings']:
        item_uri = binding['item']['value']
        # Only the part after the first 31 characters of the URI is kept.
        qs.append(item_uri[31:])
    return qs
Beispiel #10
0
def cordis_to_qs(cordis):
    """Look up Wikidata IDs from a CORDIS project ID.

    Parameters
    ----------
    cordis : str
        CORDIS identifier. It is escaped before being put in the query.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> cordis_to_qs('604134') == ['Q27990087']
    True

    """
    # Exact matching only: the literal must equal the stored P3400 value.
    sparql = """select ?item
               where {{ ?item wdt:P3400 "{cordis}" }}""".format(
        cordis=escape_string(cordis))

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint,
                         params={'query': sparql, 'format': 'json'},
                         headers=HEADERS)
    bindings = reply.json()['results']['bindings']

    # Only the part after the first 31 characters of each URI is returned.
    qs = []
    for binding in bindings:
        qs.append(binding['item']['value'][31:])
    return qs
Beispiel #11
0
def viaf_to_qs(viaf):
    """Look up Wikidata IDs from a VIAF identifier.

    Parameters
    ----------
    viaf : str
        VIAF identifier. It is escaped before being put in the query.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> viaf_to_qs('59976288') == ['Q3259614']
    True

    """
    sparql = 'select ?author where {{ ?author wdt:P214 "{viaf}" }}'.format(
        viaf=escape_string(viaf))

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    request_params = {'query': sparql, 'format': 'json'}
    payload = requests.get(endpoint, params=request_params,
                           headers=HEADERS).json()

    qs = []
    for binding in payload['results']['bindings']:
        author_uri = binding['author']['value']
        # Only the part after the first 31 characters of the URI is kept.
        qs.append(author_uri[31:])
    return qs
Beispiel #12
0
def mesh_to_qs(meshid):
    """Convert MeSH ID to Wikidata ID.

    The identifier is escaped with `escape_string` before it is inserted in
    the SPARQL string literal, so quotes or backslashes in the input cannot
    alter the query.

    Parameters
    ----------
    meshid : str
        MeSH identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> mesh_to_qs('D028441') == ['Q33659470']
    True

    """
    query = 'select ?cmp where {{ ?cmp wdt:P486 "{meshid}" }}'.format(
        meshid=escape_string(meshid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [item['cmp']['value'][31:] for item in data['results']['bindings']]
Beispiel #13
0
def cas_to_qs(cas):
    """Convert a CAS registry number to Wikidata ID.

    The registry number is escaped with `escape_string` before it is
    inserted in the SPARQL string literal, so quotes or backslashes in the
    input cannot alter the query.

    Parameters
    ----------
    cas : str
        CAS registry number

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> cas_to_qs('50-00-0') == ['Q161210']
    True

    """
    # This query only matches on exact match
    query = """select ?item
               where {{ ?item wdt:P231 "{cas}" }}""".format(
        cas=escape_string(cas))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [item['item']['value'][31:] for item in data['results']['bindings']]
Beispiel #14
0
def orcid_to_qs(orcid):
    """Look up Wikidata IDs from an ORCID identifier.

    Parameters
    ----------
    orcid : str
        ORCID identifier. It is escaped before being put in the query.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> orcid_to_qs('0000-0001-6128-3356') == ['Q20980928']
    True

    """
    sparql = 'select ?author where {{ ?author wdt:P496 "{orcid}" }}'.format(
        orcid=escape_string(orcid))

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint,
                         params={'query': sparql, 'format': 'json'},
                         headers=HEADERS)
    bindings = reply.json()['results']['bindings']

    # Only the part after the first 31 characters of each URI is returned.
    qs = []
    for binding in bindings:
        qs.append(binding['author']['value'][31:])
    return qs
Beispiel #15
0
def ror_to_qs(rorid):
    """Convert a ROR identifier to Wikidata ID.

    Wikidata Query Service is used to resolve the ROR identifier.

    The identifier is escaped with `escape_string` before it is inserted in
    the SPARQL string literal, so quotes or backslashes in the input cannot
    alter the query.

    Parameters
    ----------
    rorid : str
        ROR identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> ror_to_qs('038321296') == ['Q5566337']
    True

    """
    query = 'select ?work where {{ ?work wdt:P6782 "{rorid}" }}'.format(
        rorid=escape_string(rorid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [item['work']['value'][31:] for item in data['results']['bindings']]
Beispiel #16
0
def wikipathways_to_qs(wpid):
    """Convert a WikiPathways identifier to Wikidata ID.

    Wikidata Query Service is used to resolve the WikiPathways identifier.

    The identifier is escaped with `escape_string` before it is inserted in
    the SPARQL string literal, so quotes or backslashes in the input cannot
    alter the query.

    Parameters
    ----------
    wpid : str
        WikiPathways identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> wikipathways_to_qs('WP111') == ['Q28031254']
    True

    """
    query = ('select ?work where {{ VALUES ?wpid {{ "{wpid}" }} '
             '?work wdt:P2410 ?wpid }}').format(wpid=escape_string(wpid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [item['work']['value'][31:] for item in data['results']['bindings']]
Beispiel #17
0
def pubmed_to_qs(pmid):
    """Convert a PubMed identifier to Wikidata ID.

    Wikidata Query Service is used to resolve the PubMed identifier.

    The identifier is used as given (no case conversion is performed). It
    is escaped with `escape_string` before it is inserted in the SPARQL
    string literal, so quotes or backslashes in the input cannot alter the
    query.

    Parameters
    ----------
    pmid : str
        PubMed identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> pubmed_to_qs('29029422') == ['Q42371516']
    True

    """
    query = 'select ?work where {{ ?work wdt:P698 "{pmid}" }}'.format(
        pmid=escape_string(pmid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [item['work']['value'][31:] for item in data['results']['bindings']]
Beispiel #18
0
def pubchem_to_qs(cid):
    """Convert a PubChem compound identifier (CID) to Wikidata ID.

    Wikidata Query Service is used to resolve the PubChem identifier.

    The identifier is escaped with `escape_string` before it is inserted in
    the SPARQL string literal, so quotes or backslashes in the input cannot
    alter the query.

    Parameters
    ----------
    cid : str
        PubChem compound identifier (CID)

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> pubchem_to_qs('14123361') == ['Q289372']
    True

    """
    query = 'select ?chemical where {{ ?chemical wdt:P662 "{cid}" }}'.format(
        cid=escape_string(cid))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    # Drop the first 31 characters of each item URI to keep the identifier.
    return [
        item['chemical']['value'][31:] for item in data['results']['bindings']
    ]
Beispiel #19
0
def github_to_qs(github):
    """Look up Wikidata IDs from a GitHub account name.

    Parameters
    ----------
    github : str
        GitHub account identifier. It is escaped before being put in
        the query.

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> github_to_qs('vrandezo') == ['Q18618629']
    True

    """
    # Exact matching only: the literal must equal the stored P2037 value.
    sparql = """select ?item
               where {{ ?item wdt:P2037 "{github}" }}""".format(
        github=escape_string(github))

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    request_params = {'query': sparql, 'format': 'json'}
    payload = requests.get(endpoint, params=request_params,
                           headers=HEADERS).json()

    qs = []
    for binding in payload['results']['bindings']:
        item_uri = binding['item']['value']
        # Only the part after the first 31 characters of the URI is kept.
        qs.append(item_uri[31:])
    return qs
Beispiel #20
0
def q_to_label(q, language='en'):
    """Get label for Q item.

    Both `q` and `language` are escaped with `escape_string` before they
    are inserted in the SPARQL query.

    Parameters
    ----------
    q : str
        String with Wikidata Q item.
    language : str
        String with language identifier

    Returns
    -------
    label : str or None
        String with label corresponding to Wikidata item. None is returned
        unless the query yields exactly one label.

    Examples
    --------
    >>> q_to_label('Q80') == "Tim Berners-Lee"
    True

    """
    query = """SELECT ?label WHERE {{ wd:{q} rdfs:label ?label .
        FILTER (LANG(?label) = "{language}") }}""".format(
        q=escape_string(q), language=escape_string(language))

    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    data = response.json()

    results = data['results']['bindings']
    # Zero or several labels are both treated as "no label".
    if len(results) != 1:
        return None
    return results[0]['label']['value']
Beispiel #21
0
def q_to_bibliography_templates(q):
    """Build a wikitext bibliography for a Wikidata item.

    The bibliography SPARQL query is run for the item and each result row
    is rendered with either the news or the journal citation template.

    Parameters
    ----------
    q : str
        String with Wikidata item identifier.

    Returns
    -------
    wikitext : str
        String with wikipedia template formatted bibliography. It starts
        with an HTML comment stating how the text was generated.

    References
    ----------
    https://en.wikipedia.org/wiki/Template:Cite_journal

    Examples
    --------
    >>> wikitext = q_to_bibliography_templates("Q28923929")
    >>> wikitext.find('Cite journal') != -1
    True

    """
    sparql = BIBLIOGRAPHY_SPARQL_QUERY.format(q=q)
    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint, params={'query': sparql, 'format': 'json'})
    rows = reply.json()['results']['bindings']

    # Header comment, followed by one rendered citation per result row.
    pieces = [
        ('<!-- Generated with scholia.wikipedia '
         'q-to-bibliography-templates {q}\n').format(q=q),
        ('     or http://tools.wmflabs.org/scholia/'
         'q-to-bibliography-templates?q={q} -->\n').format(q=q),
    ]
    for row in rows:
        # Q5707594 (news article) and Q17928402 (blog post) use the news
        # template; every other type uses the journal template.
        if _value(row, 'type').endswith(('Q5707594', 'Q17928402')):
            citation = CITE_NEWS_TEMPLATE.format(
                title=_value(row, 'title'),
                work=_value(row, 'venueLabel'),
                date=_value(row, 'date').split('T')[0],
                url=_value(row, 'url'),
            )
        else:
            citation = CITE_JOURNAL_TEMPLATE.format(
                title=_value(row, 'title'),
                journal=_value(row, 'venueLabel'),
                volume=_value(row, 'volume'),
                issue=_value(row, 'issue'),
                date=_value(row, 'date').split('T')[0],
                pages=_value(row, 'pages'),
                license=_value(row, 'license'),
                doi=_value(row, 'doi'),
                url=_value(row, 'url'),
            )
        pieces.append(citation)

    return ''.join(pieces)
Beispiel #22
0
def iso639_to_q(language):
    """Convert an ISO 639 language code to a Q item.

    The code is first looked up in the ISO639_TO_Q table. If it is not
    there, the Wikidata Query Service is queried: two-letter codes are
    matched on P218 and three-letter codes on P219.

    Arguments
    ---------
    language : str
        language represented as a ISO 639 format

    Returns
    -------
    q : str or None
        Language represented as a q identifier. None if the query finds
        no matching item.

    Raises
    ------
    ValueError
        If the code is not in the table and is neither two nor three
        characters long.
    QueryResultError
        If the query returns more than one matching item.

    Examples
    --------
    >>> iso639_to_q('en') == 'Q1860'
    True

    >>> iso639_to_q('dan') == 'Q9035'
    True

    """
    if language in ISO639_TO_Q:
        return ISO639_TO_Q[language]

    # Not in the table: fall back on a query chosen by the code length.
    code_length = len(language)
    if code_length not in (2, 3):
        raise ValueError('ISO639 language code not recognized')
    if code_length == 2:
        sparql = "SELECT * {{ ?language wdt:P218 '{}' }}".format(language)
    else:
        sparql = "SELECT * {{ ?language wdt:P219 '{}' }}".format(language)

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint,
                         params={'query': sparql, 'format': 'json'},
                         headers=HEADERS)
    matches = []
    for binding in reply.json()['results']['bindings']:
        # Only the part after the first 31 characters of the URI is kept.
        matches.append(binding['language']['value'][31:])

    if not matches:
        return None
    if len(matches) > 1:
        # There shouldn't be multiple matching items, so it is not clear
        # what we can do here.
        raise QueryResultError("Multiple matching language found for "
                               "ISO639 code")
    return matches[0]
Beispiel #23
0
def count_scientific_articles():
    """Return the number of scientific articles.

    The count is obtained with a SPARQL COUNT over items whose P31 value
    is Q13442814.

    Returns
    -------
    count : int
        Number of scientific articles in Wikidata.

    """
    sparql = """
        SELECT (COUNT(*) AS ?count) WHERE { [] wdt:P31 wd:Q13442814 }"""

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint,
                         params={'query': sparql, 'format': 'json'},
                         headers=HEADERS)

    # The aggregate comes back as a single row holding the count as text.
    first_row = reply.json()['results']['bindings'][0]
    return int(first_row['count']['value'])
Beispiel #24
0
def query_to_bindings(query):
    """Run a SPARQL query and return the bindings of the response.

    The query is sent to the configured SPARQL endpoint asking for JSON,
    and the `results.bindings` part of the decoded response is returned.

    Parameters
    ----------
    query : str
        SPARQL query as string

    Returns
    -------
    bindings : list
        Data as list of dicts.

    """
    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint,
                         params={'query': query, 'format': 'json'},
                         headers=HEADERS)
    payload = reply.json()
    bindings = payload['results']['bindings']
    return bindings
Beispiel #25
0
def search(query, limit=10):
    """Search Wikidata.

    Parameters
    ----------
    query : str
        Query string.
    limit : int, optional
        Number of maximum search results to return.

    Returns
    -------
    result : list of dicts
        One dict per hit with the keys 'q' (title of the hit) and
        'description' (snippet of the hit).

    """
    # Full-text search through the configured web service.
    api_url = config.get('servers', 'webservice_url')
    api_params = {
        'action': 'query',
        'list': 'search',
        'srlimit': limit,
        'srsearch': query,
        'srwhat': 'text',
        'format': 'json',
    }
    payload = requests.get(api_url, params=api_params,
                           headers=HEADERS).json()

    # Reduce each hit to the two fields that are returned.
    results = []
    for hit in payload['query']['search']:
        results.append({
            'q': hit['title'],
            'description': hit['snippet'],
        })
    return results
Beispiel #26
0
def doi_to_qs(doi):
    """Look up Wikidata IDs from a DOI.

    Wikidata Query Service is used to resolve the DOI.

    The DOI string is converted to uppercase and escaped before any
    query is made. Uppercase DOIs are default in Wikidata.

    Parameters
    ----------
    doi : str
        DOI identifier

    Returns
    -------
    qs : list of str
        List of strings with Wikidata IDs.

    Examples
    --------
    >>> doi_to_qs('10.1186/S13321-016-0161-3') == ['Q26899110']
    True

    >>> doi_to_qs('10.1016/j.stem.2016.02.016') == ['Q23008981']
    True

    """
    normalized_doi = escape_string(doi.upper())
    sparql = 'select ?work where {{ ?work wdt:P356 "{doi}" }}'.format(
        doi=normalized_doi)

    endpoint = config.get('servers', 'SPARQLEndPointURL')
    reply = requests.get(endpoint,
                         params={'query': sparql, 'format': 'json'},
                         headers=HEADERS)
    bindings = reply.json()['results']['bindings']

    # Only the part after the first 31 characters of each URI is returned.
    qs = []
    for binding in bindings:
        qs.append(binding['work']['value'][31:])
    return qs
Beispiel #27
0
# Ordered rules mapping P31 ("instance of") values to Scholia classes.
# The order is significant: the first rule sharing at least one value with
# the item decides the class. 'event' therefore appears twice, at the two
# distinct precedence levels.
_Q_TO_CLASS_RULES = (
    ('author', frozenset([
        'Q5',  # human
    ])),
    ('clinical_trial', frozenset([
        'Q30612',  # clinical trial
    ])),
    ('series', frozenset([
        'Q277759',  # book series
        'Q2217301',  # serial (publication series)
        'Q27785883',  # conference proceedings series
    ])),
    ('venue', frozenset([
        'Q737498',  # academic journal
        'Q5633421',  # scientific journal
        'Q1143604',  # proceedings
    ])),
    ('sponsor', frozenset([
        'Q157031',  # foundation
        'Q10498148',  # research council
    ])),
    ('publisher', frozenset([
        'Q2085381',  # publisher
        'Q479716',  # university publisher
    ])),
    ('protein', frozenset([
        'Q8054',  # protein
    ])),
    ('project', frozenset([
        'Q170584',  # project
        'Q1298668',  # research project
    ])),
    ('gene', frozenset([
        'Q7187',  # gene
    ])),
    ('work', frozenset([
        'Q571',  # book
        'Q191067',  # article
        'Q253623',  # patent
        'Q580922',  # preprint
        'Q1980247',  # chapter
        'Q3331189',  # edition
        'Q5707594',  # news article
        'Q10870555',  # report
        'Q10885494',  # scientific conference paper
        'Q13442814',  # scientific article
        'Q21481766',  # academic chapter
        'Q47461344',  # written work
        'Q54670950',  # conference poster
        'Q58632367',  # conference abstract
    ])),
    ('award', frozenset([
        'Q7191',  # Nobel prize
        'Q193622',  # order
        'Q230788',  # grant
        'Q378427',  # literary award
        'Q618779',  # award
        'Q1364556',  # music award
        'Q1407225',  # television award
        'Q1709894',  # journalism award
        'Q1792571',  # art prize
        'Q1829324',  # architecture award
        'Q4220917',  # film award
        'Q11448906',  # science prize
        'Q15383322',  # culture award
    ])),
    ('organization', frozenset([
        'Q3918',  # university
        'Q31855',  # research institute
        'Q38723',  # higher education institution
        'Q414147',  # academy of sciences
        'Q484652',  # international organization
        'Q748019',  # scientific society
        'Q875538',  # public university
        'Q902104',  # private university
        'Q955824',  # learned society
        'Q1371037',  # technical university
        'Q2467461',  # university department
        'Q3354859',  # collegiate university
        'Q4358176',  # council
        'Q7315155',  # research center
        'Q15936437',  # research university
        'Q23002054',  # "private not-for-profit educational"
        'Q29300714',  # international association
    ])),
    ('event_series', frozenset([
        'Q15275719',  # recurrent event
        'Q15900647',  # conference series
        'Q47258130',  # scientific conference series
        'Q47459256',  # academic workshop series
    ])),
    ('event', frozenset([
        'Q1656682',  # event
        'Q27968055',  # recurrent event edition (event in a series)
        'Q52260246',  # scientific event
    ])),
    ('disease', frozenset([
        'Q12136',  # disease
        'Q389735',  # cardiovascular system disease
        'Q18965518',  # artery disease
    ])),
    ('chemical', frozenset([
        'Q11173',  # chemical compound
        'Q36496',  # ion
        'Q79529',  # chemical substance
        'Q407595',  # metabolite
        'Q2393187',  # molecular entity
    ])),
    ('chemical_element', frozenset([
        'Q11344',  # chemical element
    ])),
    ('chemical_class', frozenset([
        'Q15711994',  # family of isomeric compounds
        'Q17339814',  # group or class of chemical substances
        'Q47154513',  # structural class of chemical compounds
        'Q55499636',  # pharmacological class of chemical compounds
        'Q55640599',  # group of ions
        'Q55662456',  # group of ortho, meta, para isomers
        'Q55662548',  # pair of cis-trans isomers
        'Q55662747',  # pair of enantiomers
        'Q55663030',  # pair of enantiomeric ions
        'Q56256086',  # group of chemical compounds
        'Q56256173',  # class of chemical compounds with similar
        # applications or functions
        'Q59199015',  # group of stereoisomers
    ])),
    ('pathway', frozenset([
        'Q4915012',  # biological pathway
    ])),
    ('taxon', frozenset([
        'Q16521',  # taxon
    ])),
    ('event', frozenset([
        'Q46855',  # hackathon
        'Q625994',  # conference
        'Q2020153',  # scientific conference
        'Q40444998',  # academic workshop
    ])),
    ('use', frozenset([
        'Q7397',  # software
        'Q1172284',  # dataset
        'Q1639024',  # mathematical software
        'Q21127166',  # Java software library
        'Q21129801',  # natural language processing toolkit
        'Q22811662',  # image database
        'Q24529812',  # statistical package
    ])),
)

# Superclasses (via P279+) that make an otherwise unmatched item a
# 'chemical_class'.
_Q_TO_CLASS_CHEMICAL_PARENTS = frozenset([
    'Q11173',  # chemical compound
    'Q79529',  # chemical substance
])


def _query_class_values(query):
    """Return the identifiers bound to ?class by a SPARQL query.

    An empty list is returned when the response is not valid JSON.
    """
    url = config.get('servers', 'SPARQLEndPointURL')
    params = {'query': query, 'format': 'json'}
    response = requests.get(url, params=params, headers=HEADERS)
    try:
        data = response.json()
    except JSONDecodeError:
        # If the query service does not return a proper response,
        # then fallback on nothing.
        return []
    # Drop the first 31 characters of each URI to keep the identifier.
    return [
        item['class']['value'][31:] for item in data['results']['bindings']
    ]


def q_to_class(q):
    """Return Scholia class of Wikidata item.

    The 'class', i.e., which kind of instance, the item is by querying
    the Wikidata Query Service.

    Parameters
    ----------
    q : str
        Wikidata item identifier.

    Returns
    -------
    class_ : 'author', 'venue', 'organization', ...
        Scholia class represented as a string.

    Notes
    -----
    The Wikidata Query Service will be queried for P31 value. The value
    is compared against a set of hardcoded matches, in a fixed order of
    precedence.

    If none of them match, a second query fetches the superclasses of the
    item (P279+): the item is a 'chemical_class' if one of them is chemical
    compound or chemical substance, and a 'topic' otherwise.

    A response that cannot be decoded as JSON is treated as an empty
    result for either query, so in that case 'topic' is returned instead
    of an exception being raised.

    """
    query = 'SELECT ?class {{ wd:{q} wdt:P31 ?class }}'.format(
        q=escape_string(q))
    classes = set(_query_class_values(query))

    # Hard-coded matching: the first rule with a common value wins.
    for class_, values in _Q_TO_CLASS_RULES:
        if classes & values:
            return class_

    # No direct match: look at the superclasses of the item instead.
    query = 'select ?class where {{ wd:{q} wdt:P279+ ?class }}'.format(
        q=escape_string(q))
    parents = set(_query_class_values(query))

    if parents & _Q_TO_CLASS_CHEMICAL_PARENTS:
        return 'chemical_class'
    return 'topic'
Beispiel #28
0
# Value used for the User-Agent HTTP header.
USER_AGENT = 'Scholia'

# HTTP headers carrying the user agent defined above.
HEADERS = {'User-Agent': USER_AGENT}

# SPARQL template with the `label`, `title` and `url` placeholders; braces
# that belong to the SPARQL syntax are doubled, presumably because the
# template is filled in with str.format. Optional matches on label
# (rdfs:label), title (P1476) and URL (P953) are coalesced into ?paper.
# NOTE(review): ?full_text_url is not bound anywhere in this query, so
# COALESCE can never pick it -- confirm whether that is intended.
PAPER_TO_Q_QUERY = u("""
SELECT ?paper WHERE {{
  OPTIONAL {{ ?label rdfs:label "{label}"@en . }}
  OPTIONAL {{ ?title wdt:P1476 "{title}"@en . }}
  OPTIONAL {{ ?url wdt:P953 <{url}> . }}
  BIND(COALESCE(?full_text_url, ?url, ?label, ?title) AS ?paper)
}}
""")

# SPARQL Endpoint for Wikidata Query Service, read from the configuration
# when the module is loaded.
WDQS_URL = config.get('servers', 'SPARQLEndPointURL')


def paper_to_q(paper):
    """Find Q identifier for paper.

    Parameters
    ----------
    paper : dict
        Paper represented as dictionary.

    Returns
    -------
    q : str or None
        Q identifier in Wikidata. None is returned if the paper is not found.
Beispiel #29
0
def search_article_titles(q, search_string=None):
    """Search articles with q item.

    Parameters
    ----------
    q : str
        String with Wikidata Q item.
    search_string : str, optional
        String with query string. If it is not provided then the label of
        q items is used as the query string.

    Returns
    -------
    results : list of dict
        List of dicts with query result. Each dict has the keys 'title'
        and 'q'. The list is empty if no search string is given and no
        label can be found for the q item.

    Notes
    -----
    This function uses the Egon Willighagen trick with iterating
    over batches of 500'000 articles and performing a search
    in the (scientific) article title for the query string via the `CONTAINS`
    SPARQL function. Case is ignored.

    Articles that already have the q item (or a subclass of it) as main
    subject (P921) are excluded from the results.

    Both the search string and `q` are escaped with `escape_string` before
    they are inserted in the SPARQL query.

    """
    if search_string is None:
        search_string = q_to_label(q)
    if search_string is None:
        # No label could be found for the item: nothing to search for.
        return []

    query_template = """
      SELECT
        ?article ?title
      WITH {{
        SELECT ?article WHERE {{
          ?article wdt:P31 wd:Q13442814
        }}
        LIMIT {batch_size}
        OFFSET {offset}
      }} AS %results
      WHERE {{
        INCLUDE %results
        ?article wdt:P1476 ?title .
        MINUS {{ ?article wdt:P921 / wdt:P279* wd:{q} }}
        FILTER (CONTAINS(LCASE(?title), "{label}"))
      }}"""

    # These values are the same for every batch, so compute them once.
    label = escape_string(search_string.lower())
    escaped_q = escape_string(q)

    # Number of articles and a bit more to account for possible
    # addition during query.
    article_count = count_scientific_articles() + 1000

    url = config.get('servers', 'SPARQLEndPointURL')

    batch_size = 500000
    loops = article_count // batch_size + 1

    results = []
    for loop in range(loops):
        query = query_template.format(batch_size=batch_size,
                                      offset=loop * batch_size,
                                      label=label,
                                      q=escaped_q)

        params = {'query': query, 'format': 'json'}
        response = requests.get(url, params=params, headers=HEADERS)
        data = response.json()
        # Drop the first 31 characters of each article URI to keep the
        # identifier.
        results.extend({
            'title': item['title']['value'],
            'q': item['article']['value'][31:]
        } for item in data['results']['bindings'])
    return results