def get_url_csl_item_greycite(url: str) -> CSLItem:
    """
    Uses Greycite which has experienced uptime problems in the past.
    API calls seem to take at least 15 seconds. Browser requests are much
    faster. Setting header did not have an effect. Consider mimicking
    browser using selenium.

    More information on Greycite at:
    http://greycite.knowledgeblog.org/
    http://knowledgeblog.org/greycite
    https://arxiv.org/abs/1304.7151
    https://git.io/v9N2C
    """
    import json
    import re

    import requests

    from manubot.util import get_manubot_user_agent

    headers = {
        "Connection": "close",  # https://github.com/kennethreitz/requests/issues/4023
        "User-Agent": get_manubot_user_agent(),
    }
    response = requests.get(
        "http://greycite.knowledgeblog.org/json", params={"uri": url}, headers=headers
    )
    # Some Greycite responses were valid JSON except for an error appended
    # like "<p>*** Date set from uri<p>" or "<p>*** fetch error : 404<p>".
    pattern = re.compile(r"<p>\*\*\*.*<p>")
    text = pattern.sub("", response.text)
    csl_item = json.loads(text)
    csl_item["type"] = "webpage"
    return csl_item

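# Illustrative usage sketch, not part of the original module. The URL is just
# an example; Greycite uptime is unreliable, per the docstring above.
def _example_get_url_csl_item_greycite():
    csl_item = get_url_csl_item_greycite("http://knowledgeblog.org/greycite")
    assert csl_item["type"] == "webpage"  # set unconditionally by the function
    return csl_item
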
def get_pubmed_csl_item(pmid):
    """
    Query NCBI E-Utilities to create CSL Items for PubMed IDs.

    https://github.com/manubot/manubot/issues/21
    https://github.com/ncbi/citation-exporter/issues/3#issuecomment-355313143
    """
    pmid = str(pmid)
    params = {"db": "pubmed", "id": pmid, "rettype": "full"}
    headers = {"User-Agent": get_manubot_user_agent()}
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    with _get_eutils_rate_limiter():
        response = requests.get(url, params, headers=headers)
    try:
        element_tree = xml.etree.ElementTree.fromstring(response.text)
        (element_tree,) = list(element_tree)
    except Exception as error:
        logging.error(
            f"Error fetching PubMed metadata for {pmid}.\n"
            f"Invalid XML response from {response.url}:\n{response.text}"
        )
        raise error
    try:
        csl_item = csl_item_from_pubmed_article(element_tree)
    except Exception as error:
        msg = f"Error parsing the following PubMed metadata for PMID {pmid}:\n{response.text}"
        logging.error(msg)
        raise error
    return csl_item

def get_pubmed_citeproc(pmid):
    """
    Query NCBI E-Utilities to create CSL Items for PubMed IDs.

    https://github.com/greenelab/manubot/issues/21
    https://github.com/ncbi/citation-exporter/issues/3#issuecomment-355313143
    """
    pmid = str(pmid)
    params = {
        'db': 'pubmed',
        'id': pmid,
        'rettype': 'full',
    }
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    with _get_eutils_rate_limiter():
        response = requests.get(url, params, headers=headers)
    try:
        element_tree = xml.etree.ElementTree.fromstring(response.text)
        element_tree, = list(element_tree)
    except Exception as error:
        logging.error(
            f'Error fetching PubMed metadata for {pmid}.\n'
            f'Invalid XML response from {response.url}:\n{response.text}')
        raise error
    try:
        citeproc = citeproc_from_pubmed_article(element_tree)
    except Exception as error:
        msg = f'Error parsing the following PubMed metadata for PMID {pmid}:\n{response.text}'
        logging.error(msg)
        raise error
    return citeproc

def get_pmid_for_doi(doi):
    """
    Query NCBI's E-utilities to retrieve the PMID for a DOI.
    """
    assert isinstance(doi, str)
    assert doi.startswith('10.')
    params = {
        'db': 'pubmed',
        'term': f'{doi}[DOI]',
    }
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    with _get_eutils_rate_limiter():
        response = requests.get(url, params, headers=headers)
    if not response.ok:
        logging.warning(
            f'Status code {response.status_code} querying {response.url}\n')
        return None
    try:
        element_tree = xml.etree.ElementTree.fromstring(response.text)
    except Exception:
        logging.warning(
            f'Error in ESearch XML for DOI: {doi}.\n'
            f'Response from {response.url}:\n{response.text}')
        return None
    id_elems = element_tree.findall('IdList/Id')
    if len(id_elems) != 1:
        logging.debug(
            f'No PMIDs found for {doi}.\n'
            f'Response from {response.url}:\n{response.text}')
        return None
    id_elem, = id_elems
    return id_elem.text

def export_as_csl(zotero_data: ZoteroData) -> CSLItems:
    """
    Export Zotero JSON data to CSL JSON using a translation-server /export query.
    Performs a similar query to the following curl command:
    ```
    curl --verbose \
      --data @items.json \
      --header 'Content-Type: application/json' \
      'https://translate.manubot.org/export?format=csljson'
    ```
    """
    api_url = f"{base_url}/export"
    params = {"format": "csljson"}
    headers = {"User-Agent": get_manubot_user_agent()}
    response = requests.post(api_url, params=params, headers=headers, json=zotero_data)
    if not response.ok:
        message = f"export_as_csl: translation-server returned status code {response.status_code}"
        logging.warning(f"{message} with the following output:\n{response.text}")
        raise requests.HTTPError(message)
    try:
        csl_items = response.json()
    except Exception as error:
        logging.warning(f"Error parsing export_as_csl output as JSON:\n{response.text}")
        raise error
    return csl_items

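# Hedged sketch (not from the original module) showing an assumed two-step
# workflow: fetch Zotero JSON with search_query (defined below), then convert
# it to CSL JSON via the translation-server /export endpoint.
def _example_export_as_csl():
    zotero_data = search_query("10.2307/4486062")  # example DOI from this module
    csl_items = export_as_csl(zotero_data)
    assert isinstance(csl_items, list)
    return csl_items
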
def get_doi_citeproc(doi):
    """
    Use Content Negotiation (http://citation.crosscite.org/docs.html) to
    retrieve the citeproc JSON citation for a DOI.
    """
    url = 'https://doi.org/' + urllib.request.quote(doi)
    header = {
        'Accept': 'application/vnd.citationstyles.csl+json',
        'User-Agent': get_manubot_user_agent(),
    }
    response = requests.get(url, headers=header)
    try:
        citeproc = response.json()
    except Exception as error:
        logging.error(
            f'Error fetching metadata for doi:{doi}.\n'
            f'Invalid response from {response.url}:\n{response.text}')
        raise error
    citeproc['URL'] = f'https://doi.org/{doi}'
    short_doi_url = get_short_doi_url(doi)
    if short_doi_url:
        citeproc['URL'] = short_doi_url
    try:
        citeproc.update(get_pubmed_ids_for_doi(doi))
    except Exception:
        logging.warning(
            f'Error calling get_pubmed_ids_for_doi for {doi}', exc_info=True)
    return citeproc

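# Illustrative call, not original code. The 'URL' field is set by
# get_doi_citeproc itself, so the assertion below is safe.
def _example_get_doi_citeproc():
    citeproc = get_doi_citeproc('10.2307/4486062')  # example DOI from this module
    assert citeproc['URL'].startswith('https://doi.org/')
    return citeproc
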
def web_query(url: str) -> ZoteroData:
    """
    Return Zotero citation metadata for a URL as a list containing a single
    element that is a dictionary with the URL's metadata.
    """
    headers = {"User-Agent": get_manubot_user_agent(), "Content-Type": "text/plain"}
    params = {"single": 1}
    api_url = f"{base_url}/web"
    response = requests.post(api_url, params=params, headers=headers, data=str(url))
    try:
        zotero_data = response.json()
    except Exception as error:
        logging.warning(
            f"Error parsing web_query output as JSON for {url}:\n{response.text}"
        )
        raise error
    if response.status_code == 300:
        # When single=1 is specified, multiple results should never be returned
        logging.warning(
            f"web_query returned multiple results for {url}:\n"
            + json.dumps(zotero_data, indent=2)
        )
        raise ValueError(f"multiple results for {url}")
    zotero_data = _passthrough_zotero_data(zotero_data)
    return zotero_data

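# Minimal usage sketch (assumed, not part of the original module). Because
# single=1 is passed above, the returned list should contain one element.
def _example_web_query():
    zotero_data = web_query("https://manubot.org")  # placeholder URL
    assert len(zotero_data) == 1
    return zotero_data
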
def search_query(identifier: str) -> ZoteroData:
    """
    Retrieve Zotero metadata for a DOI, ISBN, PMID, or arXiv ID.
    Example usage:
    ```shell
    curl --silent \
      --data '10.2307/4486062' \
      --header 'Content-Type: text/plain' \
      http://127.0.0.1:1969/search
    ```
    """
    api_url = f"{base_url}/search"
    headers = {"User-Agent": get_manubot_user_agent(), "Content-Type": "text/plain"}
    response = requests.post(api_url, headers=headers, data=str(identifier))
    try:
        zotero_data = response.json()
    except Exception as error:
        logging.warning(
            f"Error parsing search_query output as JSON for {identifier}:\n{response.text}"
        )
        raise error
    zotero_data = _passthrough_zotero_data(zotero_data)
    return zotero_data

def get_pubmed_csl_item(pmid: Union[str, int]) -> Dict[str, Any]:
    """
    Query NCBI E-Utilities to create CSL Items for PubMed IDs.

    https://github.com/manubot/manubot/issues/21
    https://github.com/ncbi/citation-exporter/issues/3#issuecomment-355313143
    """
    pmid = str(pmid)
    params = {"db": "pubmed", "id": pmid, "rettype": "full"}
    headers = {"User-Agent": get_manubot_user_agent()}
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    with _get_eutils_rate_limiter():
        response = requests.get(url, params, headers=headers)
    try:
        xml_article_set = ElementTree.fromstring(response.text)
        assert isinstance(xml_article_set, ElementTree.Element)
        assert xml_article_set.tag == "PubmedArticleSet"
        (xml_article,) = list(xml_article_set)
        assert xml_article.tag in ["PubmedArticle", "PubmedBookArticle"]
    except Exception as error:
        logging.error(
            f"Error fetching PubMed metadata for {pmid}.\n"
            f"Unsupported XML response from {response.url}:\n{response.text}"
        )
        raise error
    try:
        csl_item = csl_item_from_pubmed_article(xml_article)
    except Exception as error:
        msg = f"Error parsing the following PubMed metadata for PMID {pmid}:\n{response.text}"
        logging.error(msg)
        raise error
    return csl_item

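# Illustrative call, not original code. The PMID is a placeholder; both str
# and int identifiers are accepted since the function casts pmid to str.
def _example_get_pubmed_csl_item():
    csl_item = get_pubmed_csl_item(21347133)  # placeholder PMID
    assert isinstance(csl_item, dict)
    return csl_item
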
def _get_literature_citation_exporter_csl_item(database, identifier):
    """
    https://api.ncbi.nlm.nih.gov/lit/ctxp
    """
    if database not in {"pubmed", "pmc"}:
        logging.error(
            f"Error calling _get_literature_citation_exporter_csl_item.\n"
            f'database must be either "pubmed" or "pmc", not {database}'
        )
        assert False
    if not identifier:
        logging.error(
            "Error calling _get_literature_citation_exporter_csl_item.\n"
            "identifier cannot be blank"
        )
        assert False
    params = {"format": "csl", "id": identifier}
    headers = {"User-Agent": get_manubot_user_agent()}
    url = f"https://api.ncbi.nlm.nih.gov/lit/ctxp/v1/{database}/"
    response = requests.get(url, params, headers=headers)
    try:
        csl_item = response.json()
    except Exception as error:
        logging.error(
            f"Error fetching {database} metadata for {identifier}.\n"
            f"Invalid JSON response from {response.url}:\n{response.text}"
        )
        raise error
    assert isinstance(csl_item, dict)
    if csl_item.get("status", "okay") == "error":
        logging.error(
            f"Error fetching {database} metadata for {identifier}.\n"
            f"Literature Citation Exporter returned JSON indicating an error for {response.url}\n"
            f"{json.dumps(csl_item, indent=2)}"
        )
        assert False
    return csl_item

def get_pmid_for_doi(doi: str) -> Optional[str]:
    """
    Query NCBI's E-utilities to retrieve the PMID for a DOI.
    """
    assert isinstance(doi, str)
    assert doi.startswith("10.")
    params = {"db": "pubmed", "term": f"{doi}[DOI]"}
    headers = {"User-Agent": get_manubot_user_agent()}
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    with _get_eutils_rate_limiter():
        response = requests.get(url, params, headers=headers)
    if not response.ok:
        logging.warning(f"Status code {response.status_code} querying {response.url}\n")
        return None
    try:
        element_tree = ElementTree.fromstring(response.text)
        assert isinstance(element_tree, ElementTree.Element)
        assert element_tree.tag == "eSearchResult"
    except Exception:
        logging.warning(
            f"Error in ESearch XML for DOI: {doi}.\n"
            f"Response from {response.url}:\n{response.text}"
        )
        return None
    id_elems = element_tree.findall("IdList/Id")
    if len(id_elems) != 1:
        logging.debug(
            f"No PMIDs found for {doi}.\n"
            f"Response from {response.url}:\n{response.text}"
        )
        return None
    (id_elem,) = id_elems
    return id_elem.text

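# Usage sketch (assumed): the function returns the PMID as a string, or None
# when the lookup fails, so callers should handle both outcomes.
def _example_get_pmid_for_doi():
    pmid = get_pmid_for_doi("10.2307/4486062")  # example DOI from this module
    if pmid is None:
        logging.info("no PMID found for this DOI")
    return pmid
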
def get_isbn_csl_item_citoid(isbn: str):
    """
    Return CSL JSON Data for an ISBN using the Wikipedia Citoid API.
    https://en.wikipedia.org/api/rest_v1/#!/Citation/getCitation
    """
    import json
    import logging
    import re

    import requests

    from manubot.util import get_manubot_user_agent

    headers = {"User-Agent": get_manubot_user_agent()}
    url = f"https://en.wikipedia.org/api/rest_v1/data/citation/mediawiki/{isbn}"
    response = requests.get(url, headers=headers)
    result = response.json()
    if isinstance(result, dict):
        if result["title"] == "Not found.":
            raise KeyError(f"Metadata for ISBN {isbn} not found at {url}")
        else:
            raise Exception(
                f"Unable to extract CSL from JSON metadata for ISBN {isbn}:\n"
                f"{json.dumps(result)}"
            )
    (mediawiki,) = result
    csl_item = dict()
    csl_item["type"] = mediawiki.get("itemType", "book")
    if "title" in mediawiki:
        csl_item["title"] = mediawiki["title"]
    if "author" in mediawiki:
        csl_author = list()
        for last, first in mediawiki["author"]:
            csl_author.append({"given": first, "family": last})
        if csl_author:
            csl_item["author"] = csl_author
    if "date" in mediawiki:
        year_pattern = re.compile(r"[0-9]{4}")
        match = year_pattern.search(mediawiki["date"])
        if match:
            year = int(match.group())
            csl_item["issued"] = {"date-parts": [[year]]}
        else:
            logging.debug(
                f"get_isbn_csl_item_citoid: issue extracting date for ISBN {isbn}\n"
                f"metadata retrieved from {url}\n"
                f'unable to extract year from date field: {mediawiki["date"]}'
            )
    if "publisher" in mediawiki:
        csl_item["publisher"] = mediawiki["publisher"]
    if "place" in mediawiki:
        csl_item["publisher-place"] = mediawiki["place"]
    if "volume" in mediawiki:
        csl_item["volume"] = mediawiki["volume"]
    if "edition" in mediawiki:
        csl_item["edition"] = mediawiki["edition"]
    if "abstractNote" in mediawiki:
        csl_item["abstract"] = mediawiki["abstractNote"]
    csl_item["ISBN"] = isbn
    if "source" in mediawiki:
        csl_item["source"] = mediawiki["source"][0]
    if "url" in mediawiki:
        csl_item["URL"] = mediawiki["url"]
    return csl_item

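# Illustrative usage, not original code. The ISBN is a placeholder; the
# function raises KeyError when Citoid has no metadata for the ISBN.
def _example_get_isbn_csl_item_citoid():
    try:
        csl_item = get_isbn_csl_item_citoid("9780262517638")  # placeholder ISBN
    except KeyError:
        return None
    assert csl_item["ISBN"] == "9780262517638"
    return csl_item
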
def get_short_doi_url(doi: str) -> Optional[str]:
    """
    Get the shortDOI URL for a DOI.
    """
    quoted_doi = urllib.request.quote(doi)
    url = "http://shortdoi.org/{}?format=json".format(quoted_doi)
    headers = {"User-Agent": get_manubot_user_agent()}
    try:
        response = requests.get(url, headers=headers).json()
        short_doi = response["ShortDOI"]
        short_url = "https://doi.org/" + short_doi[3:]  # Remove "10/" prefix
        return short_url
    except Exception:
        logging.warning(f"shortDOI lookup failed for {doi}", exc_info=True)
        return None

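# Usage sketch (assumed). get_short_doi_url returns None on failure rather
# than raising, so a simple None check suffices.
def _example_get_short_doi_url():
    short_url = get_short_doi_url("10.2307/4486062")  # example DOI from this module
    assert short_url is None or short_url.startswith("https://doi.org/")
    return short_url
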
def search_query(identifier):
    """
    Supports DOI, ISBN, PMID, arXiv ID.
    curl -d 10.2307/4486062 -H 'Content-Type: text/plain' http://127.0.0.1:1969/search
    """
    api_url = f"{base_url}/search"
    headers = {"User-Agent": get_manubot_user_agent(), "Content-Type": "text/plain"}
    response = requests.post(api_url, headers=headers, data=str(identifier))
    try:
        zotero_data = response.json()
    except Exception as error:
        logging.warning(
            f"Error parsing search_query output as JSON for {identifier}:\n{response.text}"
        )
        raise error
    zotero_data = _passthrough_zotero_data(zotero_data)
    return zotero_data

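# Python equivalent of the curl command in the docstring above (an
# illustrative sketch, not part of the original module).
def _example_search_query():
    zotero_data = search_query("10.2307/4486062")
    assert isinstance(zotero_data, list)
    return zotero_data
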
def get_short_doi_url(doi):
    """
    Get the shortDOI URL for a DOI.
    """
    quoted_doi = urllib.request.quote(doi)
    url = 'http://shortdoi.org/{}?format=json'.format(quoted_doi)
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    try:
        response = requests.get(url, headers=headers).json()
        short_doi = response['ShortDOI']
        short_url = 'https://doi.org/' + short_doi[3:]  # Remove "10/" prefix
        return short_url
    except Exception:
        logging.warning(f'shortDOI lookup failed for {doi}', exc_info=True)
        return None

def get_doi_csl_item_crosscite(doi):
    """
    Use Content Negotiation (https://crosscite.org/docs.html) to retrieve
    the CSL Item metadata for a DOI.
    """
    url = 'https://doi.org/' + urllib.request.quote(doi)
    header = {
        'Accept': 'application/vnd.citationstyles.csl+json',
        'User-Agent': get_manubot_user_agent(),
    }
    response = requests.get(url, headers=header)
    try:
        return response.json()
    except Exception as error:
        logging.error(
            f'Error fetching metadata for doi:{doi}.\n'
            f'Invalid response from {response.url}:\n{response.text}')
        raise error

def get_doi_csl_item_crosscite(doi: str):
    """
    Use Content Negotiation to retrieve the CSL Item metadata for a DOI.
    """
    url = urllib.parse.urljoin(content_negotiation_url, urllib.request.quote(doi))
    header = {
        "Accept": "application/vnd.citationstyles.csl+json",
        "User-Agent": get_manubot_user_agent(),
    }
    response = requests.get(url, headers=header)
    try:
        return response.json()
    except Exception as error:
        logging.error(
            f"Error fetching metadata for doi:{doi}.\n"
            f"Invalid response from {response.url}:\n{response.text}"
        )
        raise error

def _get_literature_citation_exporter_csl_item(database, identifier):
    """
    https://api.ncbi.nlm.nih.gov/lit/ctxp
    """
    if database not in {'pubmed', 'pmc'}:
        logging.error(
            f'Error calling _get_literature_citation_exporter_csl_item.\n'
            f'database must be either "pubmed" or "pmc", not {database}')
        assert False
    if not identifier:
        logging.error(
            'Error calling _get_literature_citation_exporter_csl_item.\n'
            'identifier cannot be blank')
        assert False
    params = {
        'format': 'csl',
        'id': identifier,
    }
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    url = f'https://api.ncbi.nlm.nih.gov/lit/ctxp/v1/{database}/'
    response = requests.get(url, params, headers=headers)
    try:
        csl_item = response.json()
    except Exception as error:
        logging.error(
            f'Error fetching {database} metadata for {identifier}.\n'
            f'Invalid JSON response from {response.url}:\n{response.text}')
        raise error
    assert isinstance(csl_item, dict)
    if csl_item.get('status', 'okay') == 'error':
        logging.error(
            f'Error fetching {database} metadata for {identifier}.\n'
            f'Literature Citation Exporter returned JSON indicating an error for {response.url}\n'
            f'{json.dumps(csl_item, indent=2)}')
        assert False
    return csl_item

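# Illustrative call, not original code. The identifier is a placeholder PMID;
# passing 'pmc' with a PMCID is assumed to work the same way.
def _example_literature_citation_exporter():
    csl_item = _get_literature_citation_exporter_csl_item('pubmed', '21347133')
    assert isinstance(csl_item, dict)
    return csl_item
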
def query_arxiv_api(url, params):
    """
    Query the arXiv API and return the parsed XML response.
    """
    headers = {"User-Agent": get_manubot_user_agent()}
    response = requests.get(url, params, headers=headers)
    xml_tree = xml.etree.ElementTree.fromstring(response.text)
    return xml_tree

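# Usage sketch (assumed): query the arXiv Atom API for a single record, as
# get_arxiv_csl_item below does with the same endpoint and parameters.
def _example_query_arxiv_api():
    xml_tree = query_arxiv_api(
        "https://export.arxiv.org/api/query",
        params={"id_list": "1512.00567", "max_results": 1},  # example arXiv ID
    )
    return xml_tree
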
def get_isbn_csl_item_citoid(isbn):
    """
    Return CSL JSON Data for an ISBN using the Wikipedia Citoid API.
    https://en.wikipedia.org/api/rest_v1/#!/Citation/getCitation
    """
    import collections
    import json
    import logging
    import re

    import requests

    from manubot.util import get_manubot_user_agent

    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    url = f'https://en.wikipedia.org/api/rest_v1/data/citation/mediawiki/{isbn}'
    response = requests.get(url, headers=headers)
    result = response.json()
    if isinstance(result, dict):
        if result['title'] == 'Not found.':
            raise KeyError(f'Metadata for ISBN {isbn} not found at {url}')
        else:
            raise Exception(
                f'Unable to extract CSL from JSON metadata for ISBN {isbn}:\n'
                f'{json.dumps(result)}')
    mediawiki, = result
    csl_item = collections.OrderedDict()
    csl_item['type'] = mediawiki.get('itemType', 'book')
    if 'title' in mediawiki:
        csl_item['title'] = mediawiki['title']
    if 'author' in mediawiki:
        csl_author = list()
        for last, first in mediawiki['author']:
            csl_author.append({
                'given': first,
                'family': last,
            })
        if csl_author:
            csl_item['author'] = csl_author
    if 'date' in mediawiki:
        year_pattern = re.compile(r'[0-9]{4}')
        match = year_pattern.search(mediawiki['date'])
        if match:
            year = int(match.group())
            csl_item['issued'] = {'date-parts': [[year]]}
        else:
            logging.debug(
                f'get_isbn_csl_item_citoid: issue extracting date for ISBN {isbn}\n'
                f'metadata retrieved from {url}\n'
                f'unable to extract year from date field: {mediawiki["date"]}')
    if 'publisher' in mediawiki:
        csl_item['publisher'] = mediawiki['publisher']
    if 'place' in mediawiki:
        csl_item['publisher-place'] = mediawiki['place']
    if 'volume' in mediawiki:
        csl_item['volume'] = mediawiki['volume']
    if 'edition' in mediawiki:
        csl_item['edition'] = mediawiki['edition']
    if 'abstractNote' in mediawiki:
        csl_item['abstract'] = mediawiki['abstractNote']
    csl_item['ISBN'] = isbn
    if 'source' in mediawiki:
        csl_item['source'] = mediawiki['source'][0]
    if 'url' in mediawiki:
        csl_item['URL'] = mediawiki['url']
    return csl_item

def get_arxiv_csl_item(arxiv_id):
    """
    Return a CSL Item for an arXiv record.

    arxiv_id can be versioned, like `1512.00567v2`, or versionless, like
    `1512.00567`. If versionless, the arXiv API will return metadata for the
    latest version. Legacy IDs, such as `cond-mat/0703470v2`, are also
    supported.

    If arXiv has an associated DOI for the record, a warning is logged to
    alert the user that an alternative version of record exists.

    References:
    https://arxiv.org/help/api/index
    http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
    https://github.com/citation-style-language/schema/blob/master/csl-data.json
    """
    url = "https://export.arxiv.org/api/query"
    params = {"id_list": arxiv_id, "max_results": 1}
    headers = {"User-Agent": get_manubot_user_agent()}
    response = requests.get(url, params, headers=headers)

    # XML namespace prefixes
    prefix = "{http://www.w3.org/2005/Atom}"
    alt_prefix = "{http://arxiv.org/schemas/atom}"

    # Parse XML
    xml_tree = xml.etree.ElementTree.fromstring(response.text)
    (entry,) = xml_tree.findall(prefix + "entry")

    # Create dictionary for CSL Item
    csl_item = collections.OrderedDict()

    # Extract versioned arXiv ID
    url = entry.findtext(prefix + "id")
    pattern = re.compile(r"arxiv.org/abs/(.+)")
    match = pattern.search(url)
    versioned_id = match.group(1)
    csl_item["number"] = versioned_id
    _, csl_item["version"] = versioned_id.rsplit("v", 1)
    csl_item["URL"] = "https://arxiv.org/abs/" + versioned_id

    # Extract CSL title field
    csl_item["title"] = entry.findtext(prefix + "title")

    # Extract CSL date field
    published = entry.findtext(prefix + "published")
    published, _ = published.split("T", 1)
    csl_item["issued"] = {"date-parts": [[int(x) for x in published.split("-")]]}

    # Extract authors
    authors = list()
    for elem in entry.findall(prefix + "author"):
        name = elem.findtext(prefix + "name")
        author = {"literal": name}
        authors.append(author)
    csl_item["author"] = authors

    # Set container-title and publisher to arXiv
    csl_item["container-title"] = "arXiv"
    csl_item["publisher"] = "arXiv"

    # Extract abstract
    abstract = entry.findtext(prefix + "summary").strip()
    if abstract:
        csl_item["abstract"] = abstract

    # Check if the article has been published with a DOI
    DOI = entry.findtext(alt_prefix + "doi")
    if DOI:
        csl_item["DOI"] = DOI
        journal_ref = entry.findtext(alt_prefix + "journal_ref")
        msg = f"arXiv article {arxiv_id} published at https://doi.org/{DOI}"
        if journal_ref:
            msg += f" — {journal_ref}"
        logging.warning(msg)

    # Set CSL type to report for preprint
    csl_item["type"] = "report"
    return csl_item

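# Illustrative call, not original code, using the versioned ID from the
# docstring. The function always sets the CSL type to "report".
def _example_get_arxiv_csl_item():
    csl_item = get_arxiv_csl_item("1512.00567v2")
    assert csl_item["type"] == "report"
    assert csl_item["version"] == "2"
    return csl_item
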
def get_arxiv_csl_item(arxiv_id):
    """
    Return a CSL Item for an arXiv record.

    arxiv_id can be versioned, like `1512.00567v2`, or versionless, like
    `1512.00567`. If versionless, the arXiv API will return metadata for the
    latest version. Legacy IDs, such as `cond-mat/0703470v2`, are also
    supported.

    If arXiv has an associated DOI for the record, a warning is logged to
    alert the user that an alternative version of record exists.

    References:
    https://arxiv.org/help/api/index
    http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
    https://github.com/citation-style-language/schema/blob/master/csl-data.json
    """
    url = 'https://export.arxiv.org/api/query'
    params = {
        'id_list': arxiv_id,
        'max_results': 1,
    }
    headers = {
        'User-Agent': get_manubot_user_agent(),
    }
    response = requests.get(url, params, headers=headers)

    # XML namespace prefixes
    prefix = '{http://www.w3.org/2005/Atom}'
    alt_prefix = '{http://arxiv.org/schemas/atom}'

    # Parse XML
    xml_tree = xml.etree.ElementTree.fromstring(response.text)
    entry, = xml_tree.findall(prefix + 'entry')

    # Create dictionary for CSL Item
    csl_item = collections.OrderedDict()

    # Extract versioned arXiv ID
    url = entry.findtext(prefix + 'id')
    pattern = re.compile(r'arxiv.org/abs/(.+)')
    match = pattern.search(url)
    versioned_id = match.group(1)
    csl_item['number'] = versioned_id
    _, csl_item['version'] = versioned_id.rsplit('v', 1)
    csl_item['URL'] = 'https://arxiv.org/abs/' + versioned_id

    # Extract CSL title field
    csl_item['title'] = entry.findtext(prefix + 'title')

    # Extract CSL date field
    published = entry.findtext(prefix + 'published')
    published, _ = published.split('T', 1)
    csl_item['issued'] = {'date-parts': [[int(x) for x in published.split('-')]]}

    # Extract authors
    authors = list()
    for elem in entry.findall(prefix + 'author'):
        name = elem.findtext(prefix + 'name')
        author = {'literal': name}
        authors.append(author)
    csl_item['author'] = authors

    # Set container-title and publisher to arXiv
    csl_item['container-title'] = 'arXiv'
    csl_item['publisher'] = 'arXiv'

    # Extract abstract
    abstract = entry.findtext(prefix + 'summary').strip()
    if abstract:
        csl_item['abstract'] = abstract

    # Check if the article has been published with a DOI
    DOI = entry.findtext(alt_prefix + 'doi')
    if DOI:
        csl_item['DOI'] = DOI
        journal_ref = entry.findtext(alt_prefix + 'journal_ref')
        msg = f'arXiv article {arxiv_id} published at https://doi.org/{DOI}'
        if journal_ref:
            msg += f' — {journal_ref}'
        logging.warning(msg)

    # Set CSL type to report for preprint
    csl_item['type'] = 'report'
    return csl_item