Example #1
def get_url_csl_item_greycite(url: str) -> CSLItem:
    """
    Uses Greycite, which has experienced uptime problems in the past.
    API calls seem to take at least 15 seconds, whereas browser requests are
    much faster. Setting request headers did not have an effect. Consider
    mimicking a browser using Selenium.

    More information on Greycite at:
    http://greycite.knowledgeblog.org/
    http://knowledgeblog.org/greycite
    https://arxiv.org/abs/1304.7151
    https://git.io/v9N2C
    """
    import json
    import re

    import requests

    from manubot.util import get_manubot_user_agent

    headers = {
        "Connection":
        "close",  # https://github.com/kennethreitz/requests/issues/4023
        "User-Agent": get_manubot_user_agent(),
    }
    response = requests.get("http://greycite.knowledgeblog.org/json",
                            params={"uri": url},
                            headers=headers)
    # Some Greycite responses were valid JSON except for an error appended,
    # like "<p>*** Date set from uri<p>" or "<p>*** fetch error : 404<p>".
    pattern = re.compile(r"<p>\*\*\*.*<p>")
    text = pattern.sub("", response.text)
    csl_item = json.loads(text)
    csl_item["type"] = "webpage"
    return csl_item
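
# Usage sketch: the URL below is illustrative, and a live Greycite call may be
# slow or fail, per the uptime caveat in the docstring above.
csl_item = get_url_csl_item_greycite("http://knowledgeblog.org/greycite")
assert csl_item["type"] == "webpage"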
Example #2
def get_pubmed_csl_item(pmid):
    """
    Query NCBI E-Utilities to create CSL Items for PubMed IDs.

    https://github.com/manubot/manubot/issues/21
    https://github.com/ncbi/citation-exporter/issues/3#issuecomment-355313143
    """
    pmid = str(pmid)
    params = {"db": "pubmed", "id": pmid, "rettype": "full"}
    headers = {"User-Agent": get_manubot_user_agent()}
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    with _get_eutils_rate_limiter():
        response = requests.get(url, params, headers=headers)
    try:
        element_tree = xml.etree.ElementTree.fromstring(response.text)
        (element_tree, ) = list(element_tree)
    except Exception as error:
        logging.error(
            f"Error fetching PubMed metadata for {pmid}.\n"
            f"Invalid XML response from {response.url}:\n{response.text}")
        raise error
    try:
        csl_item = csl_item_from_pubmed_article(element_tree)
    except Exception as error:
        msg = f"Error parsing the following PubMed metadata for PMID {pmid}:\n{response.text}"
        logging.error(msg)
        raise error
    return csl_item
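
# Usage sketch (illustrative PMID; performs a live NCBI E-Utilities request):
csl_item = get_pubmed_csl_item("21736753")
print(csl_item.get("title"))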
Example #3
def get_pubmed_citeproc(pmid):
    """
    Query NCBI E-Utilities to create CSL Items for PubMed IDs.

    https://github.com/greenelab/manubot/issues/21
    https://github.com/ncbi/citation-exporter/issues/3#issuecomment-355313143
    """
    pmid = str(pmid)
    params = {
        'db': 'pubmed',
        'id': pmid,
        'rettype': 'full',
    }
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    with _get_eutils_rate_limiter():
        response = requests.get(url, params, headers=headers)
    try:
        element_tree = xml.etree.ElementTree.fromstring(response.text)
        element_tree, = list(element_tree)
    except Exception as error:
        logging.error(
            f'Error fetching PubMed metadata for {pmid}.\n'
            f'Invalid XML response from {response.url}:\n{response.text}')
        raise error
    try:
        citeproc = citeproc_from_pubmed_article(element_tree)
    except Exception as error:
        msg = f'Error parsing the following PubMed metadata for PMID {pmid}:\n{response.text}'
        logging.error(msg)
        raise error
    return citeproc
Example #4
def get_pmid_for_doi(doi):
    """
    Query NCBI's E-utilities to retrieve the PMID for a DOI.
    """
    assert isinstance(doi, str)
    assert doi.startswith('10.')
    params = {
        'db': 'pubmed',
        'term': f'{doi}[DOI]',
    }
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    with _get_eutils_rate_limiter():
        response = requests.get(url, params, headers=headers)
    if not response.ok:
        logging.warning(
            f'Status code {response.status_code} querying {response.url}\n')
        return None
    try:
        element_tree = xml.etree.ElementTree.fromstring(response.text)
    except Exception:
        logging.warning(f'Error in ESearch XML for DOI: {doi}.\n'
                        f'Response from {response.url}:\n{response.text}')
        return None
    id_elems = element_tree.findall('IdList/Id')
    if len(id_elems) != 1:
        logging.debug(f'No PMIDs found for {doi}.\n'
                      f'Response from {response.url}:\n{response.text}')
        return None
    id_elem, = id_elems
    return id_elem.text
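
# Usage sketch (illustrative DOI; returns None unless exactly one PMID matches):
pmid = get_pmid_for_doi("10.2307/4486062")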
Example #5
def export_as_csl(zotero_data: ZoteroData) -> CSLItems:
    """
    Export Zotero JSON data to CSL JSON using a translation-server /export query.
    Performs a query similar to the following curl command:
    ```
    curl --verbose \
      --data @items.json \
      --header 'Content-Type: application/json' \
      'https://translate.manubot.org/export?format=csljson'
    ```
    """
    api_url = f"{base_url}/export"
    params = {"format": "csljson"}
    headers = {"User-Agent": get_manubot_user_agent()}
    response = requests.post(api_url,
                             params=params,
                             headers=headers,
                             json=zotero_data)
    if not response.ok:
        message = f"export_as_csl: translation-server returned status code {response.status_code}"
        logging.warning(
            f"{message} with the following output:\n{response.text}")
        raise requests.HTTPError(message)
    try:
        csl_items = response.json()
    except Exception as error:
        logging.warning(
            f"Error parsing export_as_csl output as JSON:\n{response.text}")
        raise error
    return csl_items
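
# Usage sketch: zotero_data would normally come from a translation-server
# query; the single-item list below is a hypothetical stand-in for Zotero JSON.
zotero_data = [{"itemType": "webpage", "title": "Example", "url": "https://example.com"}]
csl_items = export_as_csl(zotero_data)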
Example #6
def get_doi_citeproc(doi):
    """
    Use Content Negotiation (http://citation.crosscite.org/docs.html) to
    retrieve the citeproc JSON citation for a DOI.
    """
    url = 'https://doi.org/' + urllib.request.quote(doi)
    header = {
        'Accept': 'application/vnd.citationstyles.csl+json',
        'User-Agent': get_manubot_user_agent(),
    }
    response = requests.get(url, headers=header)
    try:
        citeproc = response.json()
    except Exception as error:
        logging.error(
            f'Error fetching metadata for doi:{doi}.\n'
            f'Invalid response from {response.url}:\n{response.text}')
        raise error
    citeproc['URL'] = f'https://doi.org/{doi}'
    short_doi_url = get_short_doi_url(doi)
    if short_doi_url:
        citeproc['URL'] = short_doi_url
    try:
        citeproc.update(get_pubmed_ids_for_doi(doi))
    except Exception:
        logging.warning(f'Error calling get_pubmed_ids_for_doi for {doi}',
                        exc_info=True)
    return citeproc
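
# Usage sketch (illustrative DOI; performs a live Content Negotiation request):
citeproc = get_doi_citeproc("10.2307/4486062")
print(citeproc["URL"])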
Example #7
def web_query(url: str) -> ZoteroData:
    """
    Return Zotero citation metadata for a URL as a list containing a single element that
    is a dictionary with the URL's metadata.
    """
    headers = {
        "User-Agent": get_manubot_user_agent(),
        "Content-Type": "text/plain"
    }
    params = {"single": 1}
    api_url = f"{base_url}/web"
    response = requests.post(api_url,
                             params=params,
                             headers=headers,
                             data=str(url))
    try:
        zotero_data = response.json()
    except Exception as error:
        logging.warning(
            f"Error parsing web_query output as JSON for {url}:\n{response.text}"
        )
        raise error
    if response.status_code == 300:
        # When single=1 is specified, multiple results should never be returned
        logging.warning(f"web_query returned multiple results for {url}:\n" +
                        json.dumps(zotero_data, indent=2))
        raise ValueError(f"multiple results for {url}")
    zotero_data = _passthrough_zotero_data(zotero_data)
    return zotero_data
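
# Usage sketch (illustrative URL; assumes a translation-server is reachable
# at base_url):
zotero_data = web_query("https://example.com")
assert len(zotero_data) == 1  # single=1 requests at most one result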
Example #8
def search_query(identifier: str) -> ZoteroData:
    """
    Retrieve Zotero metadata for a DOI, ISBN, PMID, or arXiv ID.
    Example usage:

    ```shell
    curl --silent \
      --data '10.2307/4486062' \
      --header 'Content-Type: text/plain' \
      http://127.0.0.1:1969/search
    ```
    """
    api_url = f"{base_url}/search"
    headers = {
        "User-Agent": get_manubot_user_agent(),
        "Content-Type": "text/plain"
    }
    response = requests.post(api_url, headers=headers, data=str(identifier))
    try:
        zotero_data = response.json()
    except Exception as error:
        logging.warning(
            f"Error parsing search_query output as JSON for {identifier}:\n{response.text}"
        )
        raise error
    zotero_data = _passthrough_zotero_data(zotero_data)
    return zotero_data
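
# Usage sketch, mirroring the curl command in the docstring above:
zotero_data = search_query("10.2307/4486062")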
Example #9
def get_pubmed_csl_item(pmid: Union[str, int]) -> Dict[str, Any]:
    """
    Query NCBI E-Utilities to create CSL Items for PubMed IDs.

    https://github.com/manubot/manubot/issues/21
    https://github.com/ncbi/citation-exporter/issues/3#issuecomment-355313143
    """
    pmid = str(pmid)
    params = {"db": "pubmed", "id": pmid, "rettype": "full"}
    headers = {"User-Agent": get_manubot_user_agent()}
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    with _get_eutils_rate_limiter():
        response = requests.get(url, params, headers=headers)
    try:
        xml_article_set = ElementTree.fromstring(response.text)
        assert isinstance(xml_article_set, ElementTree.Element)
        assert xml_article_set.tag == "PubmedArticleSet"
        (xml_article,) = list(xml_article_set)
        assert xml_article.tag in ["PubmedArticle", "PubmedBookArticle"]
    except Exception as error:
        logging.error(
            f"Error fetching PubMed metadata for {pmid}.\n"
            f"Unsupported XML response from {response.url}:\n{response.text}"
        )
        raise error
    try:
        csl_item = csl_item_from_pubmed_article(xml_article)
    except Exception as error:
        msg = f"Error parsing the following PubMed metadata for PMID {pmid}:\n{response.text}"
        logging.error(msg)
        raise error
    return csl_item
Example #10
def _get_literature_citation_exporter_csl_item(database, identifier):
    """
    https://api.ncbi.nlm.nih.gov/lit/ctxp
    """
    if database not in {"pubmed", "pmc"}:
        logging.error(
            f"Error calling _get_literature_citation_exporter_csl_item.\n"
            f'database must be either "pubmed" or "pmc", not {database}')
        assert False
    if not identifier:
        logging.error(
            f"Error calling _get_literature_citation_exporter_csl_item.\n"
            f"identifier cannot be blank")
        assert False
    params = {"format": "csl", "id": identifier}
    headers = {"User-Agent": get_manubot_user_agent()}
    url = f"https://api.ncbi.nlm.nih.gov/lit/ctxp/v1/{database}/"
    response = requests.get(url, params, headers=headers)
    try:
        csl_item = response.json()
    except Exception as error:
        logging.error(
            f"Error fetching {database} metadata for {identifier}.\n"
            f"Invalid JSON response from {response.url}:\n{response.text}")
        raise error
    assert isinstance(csl_item, dict)
    if csl_item.get("status", "okay") == "error":
        logging.error(
            f"Error fetching {database} metadata for {identifier}.\n"
            f"Literature Citation Exporter returned JSON indicating an error for {response.url}\n"
            f"{json.dumps(csl_item, indent=2)}")
        assert False
    return csl_item
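
# Usage sketch (illustrative PMID; queries the live NCBI Literature Citation
# Exporter):
csl_item = _get_literature_citation_exporter_csl_item("pubmed", "21736753")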
Example #11
def get_pmid_for_doi(doi: str) -> Optional[str]:
    """
    Query NCBI's E-utilities to retrieve the PMID for a DOI.
    """
    assert isinstance(doi, str)
    assert doi.startswith("10.")
    params = {"db": "pubmed", "term": f"{doi}[DOI]"}
    headers = {"User-Agent": get_manubot_user_agent()}
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    with _get_eutils_rate_limiter():
        response = requests.get(url, params, headers=headers)
    if not response.ok:
        logging.warning(f"Status code {response.status_code} querying {response.url}\n")
        return None
    try:
        element_tree = ElementTree.fromstring(response.text)
        assert isinstance(element_tree, ElementTree.Element)
        assert element_tree.tag == "eSearchResult"
    except Exception:
        logging.warning(
            f"Error in ESearch XML for DOI: {doi}.\n"
            f"Response from {response.url}:\n{response.text}"
        )
        return None
    id_elems = element_tree.findall("IdList/Id")
    if len(id_elems) != 1:
        logging.debug(
            f"No PMIDs found for {doi}.\n"
            f"Response from {response.url}:\n{response.text}"
        )
        return None
    (id_elem,) = id_elems
    return id_elem.text
Example #12
def get_isbn_csl_item_citoid(isbn: str):
    """
    Return CSL JSON Data for an ISBN using the Wikipedia Citoid API.
    https://en.wikipedia.org/api/rest_v1/#!/Citation/getCitation
    """
    import requests

    from manubot.util import get_manubot_user_agent

    headers = {"User-Agent": get_manubot_user_agent()}
    url = f"https://en.wikipedia.org/api/rest_v1/data/citation/mediawiki/{isbn}"
    response = requests.get(url, headers=headers)
    result = response.json()
    if isinstance(result, dict):
        if result["title"] == "Not found.":
            raise KeyError(f"Metadata for ISBN {isbn} not found at {url}")
        else:
            raise Exception(
                f"Unable to extract CSL from JSON metadata for ISBN {isbn}:\n"
                f"{json.dumps(result.text)}")
    (mediawiki, ) = result
    csl_item = dict()
    csl_item["type"] = mediawiki.get("itemType", "book")
    if "title" in mediawiki:
        csl_item["title"] = mediawiki["title"]
    if "author" in mediawiki:
        csl_author = list()
        for last, first in mediawiki["author"]:
            csl_author.append({"given": first, "family": last})
        if csl_author:
            csl_item["author"] = csl_author
    if "date" in mediawiki:
        year_pattern = re.compile(r"[0-9]{4}")
        match = year_pattern.search(mediawiki["date"])
        if match:
            year = int(match.group())
            csl_item["issued"] = {"date-parts": [[year]]}
        else:
            logging.debug(
                f"get_isbn_csl_item_citoid: issue extracting date for ISBN {isbn}\n"
                f"metadata retrieved from {url}\n"
                f'unable to extract year from date field: {mediawiki["date"]}')
    if "publisher" in mediawiki:
        csl_item["publisher"] = mediawiki["publisher"]
    if "place" in mediawiki:
        csl_item["publisher-place"] = mediawiki["place"]
    if "volume" in mediawiki:
        csl_item["volume"] = mediawiki["volume"]
    if "edition" in mediawiki:
        csl_item["edition"] = mediawiki["edition"]
    if "abstractNote" in mediawiki:
        csl_item["abstract"] = mediawiki["abstractNote"]
    csl_item["ISBN"] = isbn
    if "source" in mediawiki:
        csl_item["source"] = mediawiki["source"][0]
    if "url" in mediawiki:
        csl_item["URL"] = mediawiki["url"]
    return csl_item
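
# Usage sketch (illustrative ISBN; queries the live Wikipedia Citoid API):
csl_item = get_isbn_csl_item_citoid("9780262517638")
print(csl_item.get("title"))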
Example #13
def get_short_doi_url(doi: str) -> Optional[str]:
    """
    Get the shortDOI URL for a DOI.
    """
    quoted_doi = urllib.request.quote(doi)
    url = "http://shortdoi.org/{}?format=json".format(quoted_doi)
    headers = {"User-Agent": get_manubot_user_agent()}
    try:
        response = requests.get(url, headers=headers).json()
        short_doi = response["ShortDOI"]
        short_url = "https://doi.org/" + short_doi[3:]  # Remove "10/" prefix
        return short_url
    except Exception:
        logging.warning(f"shortDOI lookup failed for {doi}", exc_info=True)
        return None
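
# Usage sketch: shortDOI aliases start with "10/", which the function strips
# when building the short URL (illustrative DOI below).
short_url = get_short_doi_url("10.2307/4486062")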
Example #14
def search_query(identifier):
    """
    Supports DOI, ISBN, PMID, arXiv ID.
    curl -d 10.2307/4486062 -H 'Content-Type: text/plain' http://127.0.0.1:1969/search
    """
    api_url = f"{base_url}/search"
    headers = {"User-Agent": get_manubot_user_agent(), "Content-Type": "text/plain"}
    response = requests.post(api_url, headers=headers, data=str(identifier))
    try:
        zotero_data = response.json()
    except Exception as error:
        logging.warning(
            f"Error parsing search_query output as JSON for {identifier}:\n{response.text}"
        )
        raise error
    zotero_data = _passthrough_zotero_data(zotero_data)
    return zotero_data
Example #15
def get_short_doi_url(doi):
    """
    Get the shortDOI URL for a DOI.
    """
    quoted_doi = urllib.request.quote(doi)
    url = 'http://shortdoi.org/{}?format=json'.format(quoted_doi)
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    try:
        response = requests.get(url, headers=headers).json()
        short_doi = response['ShortDOI']
        short_url = 'https://doi.org/' + short_doi[3:]  # Remove "10/" prefix
        return short_url
    except Exception:
        logging.warning(f'shortDOI lookup failed for {doi}', exc_info=True)
        return None
Example #16
def get_doi_csl_item_crosscite(doi):
    """
    Use Content Negotiation (https://crosscite.org/docs.html) to
    retrieve the CSL Item metadata for a DOI.
    """
    url = 'https://doi.org/' + urllib.request.quote(doi)
    header = {
        'Accept': 'application/vnd.citationstyles.csl+json',
        'User-Agent': get_manubot_user_agent(),
    }
    response = requests.get(url, headers=header)
    try:
        return response.json()
    except Exception as error:
        logging.error(f'Error fetching metadata for doi:{doi}.\n'
                      f'Invalid response from {response.url}:\n{response.text}')
        raise error
Example #17
def get_doi_csl_item_crosscite(doi: str):
    """
    Use Content Negotiation to retrieve the CSL Item
    metadata for a DOI.
    """
    url = urllib.parse.urljoin(content_negotiation_url,
                               urllib.request.quote(doi))
    header = {
        "Accept": "application/vnd.citationstyles.csl+json",
        "User-Agent": get_manubot_user_agent(),
    }
    response = requests.get(url, headers=header)
    try:
        return response.json()
    except Exception as error:
        logging.error(
            f"Error fetching metadata for doi:{doi}.\n"
            f"Invalid response from {response.url}:\n{response.text}")
        raise error
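
# Usage sketch (illustrative DOI; assumes content_negotiation_url is defined
# at module level, e.g. "https://doi.org/"):
csl_item = get_doi_csl_item_crosscite("10.2307/4486062")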
Example #18
def _get_literature_citation_exporter_csl_item(database, identifier):
    """
    https://api.ncbi.nlm.nih.gov/lit/ctxp
    """
    if database not in {'pubmed', 'pmc'}:
        logging.error(
            f'Error calling _get_literature_citation_exporter_csl_item.\n'
            f'database must be either "pubmed" or "pmc", not {database}')
        assert False
    if not identifier:
        logging.error(
            f'Error calling _get_literature_citation_exporter_csl_item.\n'
            f'identifier cannot be blank')
        assert False
    params = {
        'format': 'csl',
        'id': identifier,
    }
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    url = f'https://api.ncbi.nlm.nih.gov/lit/ctxp/v1/{database}/'
    response = requests.get(url, params, headers=headers)
    try:
        csl_item = response.json()
    except Exception as error:
        logging.error(
            f'Error fetching {database} metadata for {identifier}.\n'
            f'Invalid JSON response from {response.url}:\n{response.text}')
        raise error
    assert isinstance(csl_item, dict)
    if csl_item.get('status', 'okay') == 'error':
        logging.error(
            f'Error fetching {database} metadata for {identifier}.\n'
            f'Literature Citation Exporter returned JSON indicating an error for {response.url}\n'
            f'{json.dumps(csl_item, indent=2)}')
        assert False
    return csl_item
Example #19
def query_arxiv_api(url, params):
    """Send a GET request to an arXiv API endpoint and return the parsed XML root."""
    headers = {"User-Agent": get_manubot_user_agent()}
    response = requests.get(url, params, headers=headers)
    xml_tree = xml.etree.ElementTree.fromstring(response.text)
    return xml_tree
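
# Usage sketch, issuing the same query that get_arxiv_csl_item performs
# (versioned arXiv ID taken from its docstring):
xml_tree = query_arxiv_api(
    "https://export.arxiv.org/api/query",
    {"id_list": "1512.00567v2", "max_results": 1},
)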
Example #20
def get_isbn_csl_item_citoid(isbn):
    """
    Return CSL JSON Data for an ISBN using the Wikipedia Citoid API.
    https://en.wikipedia.org/api/rest_v1/#!/Citation/getCitation
    """
    import requests
    from manubot.util import get_manubot_user_agent
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    url = f'https://en.wikipedia.org/api/rest_v1/data/citation/mediawiki/{isbn}'
    response = requests.get(url, headers=headers)
    result = response.json()
    if isinstance(result, dict):
        if result['title'] == 'Not found.':
            raise KeyError(f'Metadata for ISBN {isbn} not found at {url}')
        else:
            raise Exception(
                f'Unable to extract CSL from JSON metadata for ISBN {isbn}:\n'
                f'{json.dumps(result)}')
    mediawiki, = result
    csl_item = collections.OrderedDict()
    csl_item['type'] = mediawiki.get('itemType', 'book')
    if 'title' in mediawiki:
        csl_item['title'] = mediawiki['title']
    if 'author' in mediawiki:
        csl_author = list()
        for last, first in mediawiki['author']:
            csl_author.append({
                'given': first,
                'family': last,
            })
        if csl_author:
            csl_item['author'] = csl_author
    if 'date' in mediawiki:
        year_pattern = re.compile(r'[0-9]{4}')
        match = year_pattern.search(mediawiki['date'])
        if match:
            year = int(match.group())
            csl_item['issued'] = {'date-parts': [[year]]}
        else:
            logging.debug(
                f'get_isbn_csl_item_citoid: issue extracting date for ISBN {isbn}\n'
                f'metadata retrieved from {url}\n'
                f'unable to extract year from date field: {mediawiki["date"]}')
    if 'publisher' in mediawiki:
        csl_item['publisher'] = mediawiki['publisher']
    if 'place' in mediawiki:
        csl_item['publisher-place'] = mediawiki['place']
    if 'volume' in mediawiki:
        csl_item['volume'] = mediawiki['volume']
    if 'edition' in mediawiki:
        csl_item['edition'] = mediawiki['edition']
    if 'abstractNote' in mediawiki:
        csl_item['abstract'] = mediawiki['abstractNote']
    csl_item['ISBN'] = isbn
    if 'source' in mediawiki:
        csl_item['source'] = mediawiki['source'][0]
    if 'url' in mediawiki:
        csl_item['URL'] = mediawiki['url']
    return csl_item
Example #21
def get_arxiv_csl_item(arxiv_id):
    """
    Return csl_item item for an arXiv record.

    arxiv_id can be versioned, like `1512.00567v2`, or versionless, like
    `1512.00567`. If versionless, the arXiv API will return metadata for the
    latest version. Legacy IDs, such as `cond-mat/0703470v2`, are also
    supported.

    If arXiv has an associated DOI for the record, a warning is logged to
    alert the user that an alternative version of record exists.

    References:
    https://arxiv.org/help/api/index
    http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
    https://github.com/citation-style-language/schema/blob/master/csl-data.json
    """
    url = "https://export.arxiv.org/api/query"
    params = {"id_list": arxiv_id, "max_results": 1}
    headers = {"User-Agent": get_manubot_user_agent()}
    response = requests.get(url, params, headers=headers)

    # XML namespace prefixes
    prefix = "{http://www.w3.org/2005/Atom}"
    alt_prefix = "{http://arxiv.org/schemas/atom}"

    # Parse XML
    xml_tree = xml.etree.ElementTree.fromstring(response.text)
    (entry, ) = xml_tree.findall(prefix + "entry")

    # Create dictionary for CSL Item
    csl_item = collections.OrderedDict()

    # Extract versioned arXiv ID
    url = entry.findtext(prefix + "id")
    pattern = re.compile(r"arxiv.org/abs/(.+)")
    match = pattern.search(url)
    versioned_id = match.group(1)
    csl_item["number"] = versioned_id
    _, csl_item["version"] = versioned_id.rsplit("v", 1)
    csl_item["URL"] = "https://arxiv.org/abs/" + versioned_id

    # Extract CSL title field
    csl_item["title"] = entry.findtext(prefix + "title")

    # Extract CSL date field
    published = entry.findtext(prefix + "published")
    published, _ = published.split("T", 1)
    csl_item["issued"] = {
        "date-parts": [[int(x) for x in published.split("-")]]
    }

    # Extract authors
    authors = list()
    for elem in entry.findall(prefix + "author"):
        name = elem.findtext(prefix + "name")
        author = {"literal": name}
        authors.append(author)
    csl_item["author"] = authors

    # Set container-title and publisher to arXiv
    csl_item["container-title"] = "arXiv"
    csl_item["publisher"] = "arXiv"

    # Extract abstract
    abstract = entry.findtext(prefix + "summary").strip()
    if abstract:
        csl_item["abstract"] = abstract

    # Check if the article has been published with a DOI
    DOI = entry.findtext("{http://arxiv.org/schemas/atom}doi")
    if DOI:
        csl_item["DOI"] = DOI
        journal_ref = entry.findtext(alt_prefix + "journal_ref")
        msg = f"arXiv article {arxiv_id} published at https://doi.org/{DOI}"
        if journal_ref:
            msg += f" — {journal_ref}"
        logging.warning(msg)
    # Set CSL type to "report", the type used for preprints
    csl_item["type"] = "report"
    return csl_item
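
# Usage sketch (versioned arXiv ID from the docstring; a warning is logged
# when the record has an associated DOI):
csl_item = get_arxiv_csl_item("1512.00567v2")
print(csl_item["URL"])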
Example #22
def get_arxiv_csl_item(arxiv_id):
    """
    Return csl_item item for an arXiv record.

    arxiv_id can be versioned, like `1512.00567v2`, or versionless, like
    `1512.00567`. If versionless, the arXiv API will return metadata for the
    latest version. Legacy IDs, such as `cond-mat/0703470v2`, are also
    supported.

    If arXiv has an associated DOI for the record, a warning is logged to
    alert the user that an alternative version of record exists.

    References:
    https://arxiv.org/help/api/index
    http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
    https://github.com/citation-style-language/schema/blob/master/csl-data.json
    """
    url = 'https://export.arxiv.org/api/query'
    params = {
        'id_list': arxiv_id,
        'max_results': 1,
    }
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    response = requests.get(url, params, headers=headers)

    # XML namespace prefixes
    prefix = '{http://www.w3.org/2005/Atom}'
    alt_prefix = '{http://arxiv.org/schemas/atom}'

    # Parse XML
    xml_tree = xml.etree.ElementTree.fromstring(response.text)
    entry, = xml_tree.findall(prefix + 'entry')

    # Create dictionary for CSL Item
    csl_item = collections.OrderedDict()

    # Extract versioned arXiv ID
    url = entry.findtext(prefix + 'id')
    pattern = re.compile(r'arxiv.org/abs/(.+)')
    match = pattern.search(url)
    versioned_id = match.group(1)
    csl_item['number'] = versioned_id
    _, csl_item['version'] = versioned_id.rsplit('v', 1)
    csl_item['URL'] = 'https://arxiv.org/abs/' + versioned_id

    # Extract CSL title field
    csl_item['title'] = entry.findtext(prefix + 'title')

    # Extract CSL date field
    published = entry.findtext(prefix + 'published')
    published, _ = published.split('T', 1)
    csl_item['issued'] = {
        'date-parts': [[int(x) for x in published.split('-')]]
    }

    # Extract authors
    authors = list()
    for elem in entry.findall(prefix + 'author'):
        name = elem.findtext(prefix + 'name')
        author = {'literal': name}
        authors.append(author)
    csl_item['author'] = authors

    # Set container-title and publisher to arXiv
    csl_item['container-title'] = 'arXiv'
    csl_item['publisher'] = 'arXiv'

    # Extract abstract
    abstract = entry.findtext(prefix + 'summary').strip()
    if abstract:
        csl_item['abstract'] = abstract

    # Check if the article has been published with a DOI
    DOI = entry.findtext('{http://arxiv.org/schemas/atom}doi')
    if DOI:
        csl_item['DOI'] = DOI
        journal_ref = entry.findtext(alt_prefix + 'journal_ref')
        msg = f'arXiv article {arxiv_id} published at https://doi.org/{DOI}'
        if journal_ref:
            msg += f' — {journal_ref}'
        logging.warning(msg)
    # Set CSL type to "report", the type used for preprints
    csl_item['type'] = 'report'
    return csl_item