def citation_to_citeproc(*args, **kwargs):
    """
    Deprecated alias for `citekey_to_csl_item`.

    Emits a FutureWarning announcing the rename, then forwards all positional
    and keyword arguments unchanged to `citekey_to_csl_item` and returns its
    result.
    """
    import warnings

    warnings.warn(
        "'citation_to_citeproc' has been renamed to 'citekey_to_csl_item'"
        " and will be removed in a future release.",
        category=FutureWarning,
        # Attribute the warning to the caller's line rather than to this shim,
        # so the reported location is the call that needs updating.
        stacklevel=2,
    )
    return citekey_to_csl_item(*args, **kwargs)
def generate_csl_items(args, citekeys_df):
    """
    Generate CSL (citeproc) items for standard_citekeys in citekeys_df.
    Writes the CSL JSON bibliography to args.references_path and logs errors
    for citekeys whose metadata could not be obtained.

    Parameters:

    - args: object providing `manual_references_paths`, `requests_cache_path`,
      `clear_requests_cache`, and `references_path` (which must support
      `.open('w', encoding=...)`).
    - citekeys_df: object whose `standard_citekey` attribute supports
      `.unique()` (presumably a pandas DataFrame; confirm against caller).

    Returns the list of CSL items that was written to disk. Citekeys whose
    metadata retrieval failed are omitted from the list.
    """
    # Read manual references (overrides) in JSON CSL
    manual_refs = load_manual_references(args.manual_references_paths)

    requests_cache.install_cache(args.requests_cache_path, include_get_headers=True)
    cache = requests_cache.get_cache()
    if args.clear_requests_cache:
        logging.info('Clearing requests-cache')
        requests_cache.clear()
    logging.info(
        f'requests-cache starting with {len(cache.responses)} cached responses'
    )

    csl_items = []
    failures = []
    for standard_citekey in citekeys_df.standard_citekey.unique():
        # Manual references take precedence over retrieved metadata.
        if standard_citekey in manual_refs:
            csl_items.append(manual_refs[standard_citekey])
            continue
        if standard_citekey.startswith('raw:'):
            logging.error(
                f'CSL JSON Data with a standard_citekey of {standard_citekey!r} not found in manual-references.json. '
                'Metadata must be provided for raw citekeys.')
            failures.append(standard_citekey)
            # Raw citekeys can only be resolved through manual references, so
            # skip retrieval; falling through would record the failure twice.
            continue
        try:
            csl_item = citekey_to_csl_item(standard_citekey)
            csl_items.append(csl_item)
        except Exception:
            # Best effort: one failing citekey must not abort the whole run.
            logging.exception(
                f'Citeproc retrieval failure for {standard_citekey!r}')
            failures.append(standard_citekey)

    logging.info(
        f'requests-cache finished with {len(cache.responses)} cached responses'
    )
    requests_cache.uninstall_cache()

    if failures:
        message = 'CSL JSON Data retrieval failed for the following standardized citation keys:\n{}'.format(
            '\n'.join(failures))
        logging.error(message)

    # Write JSON CSL bibliography for Pandoc.
    with args.references_path.open('w', encoding='utf-8') as write_file:
        json.dump(csl_items, write_file, indent=2, ensure_ascii=False)
        write_file.write('\n')
    return csl_items
def cli_cite(args):
    """
    Main function for the manubot cite command-line interface.

    Does not allow user to directly specify Pandoc's --to argument, due to
    inconsistent citation rendering by output format.
    See https://github.com/jgm/pandoc/issues/4834

    Reads `citekeys`, `prune_csl`, `render` and `output` from args, plus
    `format` and `csl` when rendering. When `args.render` is false, the CSL
    JSON is written to `args.output` (or to stdout if no output is given) and
    the function returns. Otherwise the references are passed to Pandoc.
    """
    # generate CSL JSON data
    csl_list = []
    for citekey in args.citekeys:
        try:
            if not is_valid_citekey(citekey):
                continue
            citekey = standardize_citekey(citekey)
            csl_item = citekey_to_csl_item(citekey, prune=args.prune_csl)
            csl_list.append(csl_item)
        except Exception as error:
            # Best effort: report the failing citekey and keep processing the rest.
            logging.error(f'citekey_to_csl_item for {citekey!r} failed '
                          f'due to a {error.__class__.__name__}:\n{error}')
            logging.info(error, exc_info=True)

    # output CSL JSON data, if --render is False
    if not args.render:
        csl_json = json.dumps(csl_list, ensure_ascii=False, indent=2) + '\n'
        if args.output:
            with args.output.open('w', encoding='utf-8') as write_file:
                write_file.write(csl_json)
        else:
            # Write to stdout without a `with` block: stdout is not ours to
            # close, and closing it breaks any later output by the process.
            sys.stdout.write(csl_json)
        return

    # use Pandoc to render references
    if not args.format and args.output:
        # Infer the format from the output file extension when possible.
        args.format = extension_to_format.get(args.output.suffix)
    if not args.format:
        args.format = 'plain'
    pandoc_metadata = {
        'nocite': '@*',
        'csl': args.csl,
        'references': csl_list,
    }
    call_pandoc(
        metadata=pandoc_metadata,
        path=args.output,
        format=args.format,
    )
def process_record(record):
    """
    Expand a catalog record with retrieved metadata.

    The record is consumed in place: 'html_url' and 'repo_url' are required
    and popped, 'thumbnail_url' and the '<type>_citation' keys are popped when
    present, and whatever remains is returned under 'extras'.

    Returns a dict with a 'manubot' entry, optional 'preprint' / 'journal'
    entries (only for citations whose citekey has a handled prefix and passes
    inspection), and 'extras'. Every entry is enriched with fields derived
    from its CSL item.
    """
    output = {}
    html_url = record.pop('html_url')
    manubot_entry = {
        'repo_url': record.pop('repo_url'),
        'url': html_url,
        'citation': f"url:{html_url}",
    }
    output['manubot'] = manubot_entry

    # Prefer an explicitly provided thumbnail; otherwise look one up from the page.
    thumbnail_url = (
        record.pop('thumbnail_url')
        if 'thumbnail_url' in record
        else get_thumbnail_url_from_html(html_url)
    )
    if thumbnail_url:
        manubot_entry['thumbnail_url'] = thumbnail_url

    for publication_type in ('preprint', 'journal'):
        citation = record.pop(f'{publication_type}_citation', None)
        if citation:
            citekey = CiteKey(citation)
            if citekey.is_handled_prefix:
                report = citekey.inspect()
                if report:
                    logging.warning(
                        f"citekey failed inspection: {citation!r}\n{report}")
                else:
                    output[publication_type] = {
                        'citation': citekey.standard_id,
                    }
            else:
                logging.warning(f"unhandled citekey: {citation!r}")

    for entry in output.values():
        csl_item = citekey_to_csl_item(entry['citation'])
        # Only fill in a URL when the entry does not already carry one.
        if 'url' not in entry and 'URL' in csl_item:
            entry['url'] = csl_item['URL']
        entry.update({
            'title': get_title(csl_item),
            'authors': get_authors_text(csl_item),
            'journal': get_journal(csl_item),
            'date_iso': get_date(csl_item),
            'date_human': get_date_summary(csl_item),
            'csl_item': csl_item,
        })

    # Added last so the enrichment loop above never sees the leftover fields.
    output['extras'] = record
    return output
def get_csl_item(url) -> dict:
    """
    Generate a CSL JSON item for a URL. Currently, does not work
    for most PDF URLs unless they are from known domains where
    persistent identifiers can be extracted.

    Raises ManubotProcessingError when citekey creation or metadata retrieval
    raises, or when the retrieved CSL item is empty. The underlying exception
    is chained as the cause.
    """
    from manubot.cite.citekey import citekey_to_csl_item, url_to_citekey

    try:
        citekey = url_to_citekey(url)
        citekey = RHCiteKey(citekey)
        csl_item = citekey_to_csl_item(citekey)
        if not csl_item:
            # Treat an empty result like any other retrieval failure so that
            # callers only have to handle ManubotProcessingError.
            raise Exception(f"Error searching for paper: {url}")
        return csl_item
    except Exception as e:
        # Chain explicitly so tracebacks show the original error as the cause.
        raise ManubotProcessingError(e) from e
def get_csl_items(self) -> tp.List:
    """
    Build the list of CSL_Items, i.e. the references list / bibliography
    for `self.citekeys`.

    Resets and repopulates two attributes:

    - `self.csl_items`: one CSL_Item per standard_id group for which an item
      could be generated.
    - `self.input_to_csl_id`: maps each citekey's input_id to the ID of its
      CSL_Item. Citekeys whose group produced no CSL_Item are left out.

    Returns `self.csl_items`.
    """
    self.input_to_csl_id = {}
    self.csl_items = []
    for _, grouped_citekeys in self.group_citekeys_by("standard_id"):
        # Every citekey in a group shares the standard_id, so the first one
        # is used to generate the item for the whole group.
        csl_item = citekey_to_csl_item(
            citekey=grouped_citekeys[0],
            prune=self.prune_csl_items,
            log_level=self.csl_item_failure_log_level,
            manual_refs=self.manual_refs,
        )
        if not csl_item:
            continue
        csl_id = csl_item["id"]
        self.input_to_csl_id.update(
            (citekey.input_id, csl_id) for citekey in grouped_citekeys
        )
        self.csl_items.append(csl_item)
    return self.csl_items
def process_record(record):
    """
    Expand a catalog record with retrieved metadata.

    The record is consumed in place: 'html_url' and 'repo_url' are required
    and popped, 'thumbnail_url' and the '<type>_citation' keys are popped when
    present, and whatever remains is returned under 'extras'.

    Returns a dict with a 'manubot' entry, optional 'preprint' / 'journal'
    entries (only for citations accepted by is_valid_citekey), and 'extras'.
    Every entry is enriched with fields derived from its CSL item.
    """
    output = {}
    html_url = record.pop('html_url')
    manubot_entry = {
        'repo_url': record.pop('repo_url'),
        'url': html_url,
        'citation': f"url:{html_url}",
    }
    output['manubot'] = manubot_entry

    # Prefer an explicitly provided thumbnail; otherwise look one up from the page.
    thumbnail_url = (
        record.pop('thumbnail_url')
        if 'thumbnail_url' in record
        else get_thumbnail_url_from_html(html_url)
    )
    if thumbnail_url:
        manubot_entry['thumbnail_url'] = thumbnail_url

    for publication_type in ('preprint', 'journal'):
        citation = record.pop(f'{publication_type}_citation', None)
        # Missing, empty, or invalid citations produce no entry.
        if citation and is_valid_citekey(citation):
            output[publication_type] = {
                'citation': citation,
            }

    for entry in output.values():
        # The entry keeps the citation as given; only the lookup key is standardized.
        standard_citation = standardize_citekey(entry['citation'])
        csl_item = citekey_to_csl_item(standard_citation)
        # Only fill in a URL when the entry does not already carry one.
        if 'url' not in entry and 'URL' in csl_item:
            entry['url'] = csl_item['URL']
        entry.update({
            'title': get_title(csl_item),
            'authors': get_authors_text(csl_item),
            'journal': get_journal(csl_item),
            'date_iso': get_date(csl_item),
            'date_human': get_date_summary(csl_item),
            'csl_item': csl_item,
        })

    # Added last so the enrichment loop above never sees the leftover fields.
    output['extras'] = record
    return output
def generate_csl_items(
    citekeys: list,
    manual_refs: Optional[dict] = None,
    requests_cache_path: Optional[str] = None,
    clear_requests_cache: Optional[bool] = False,
) -> list:
    """
    Generate CSL (citeproc) items for the standard_citekeys in citekeys.

    Parameters:

    - citekeys: list of standard_citekeys
    - manual_refs: mapping from standard_citekey to csl_item for manual references.
      If None, no manual references are used.
    - requests_cache_path: path for the requests cache database.
      Passed as cache_name to `requests_cache.install_cache`.
      requests_cache may append an extension to this path, so it is not always the exact path to the cache.
      If None, do not use requests_cache.
    - clear_requests_cache: If True, clear the requests cache before generating citekey metadata.

    Returns the list of CSL items in first-occurrence order of citekeys.
    Citekeys whose metadata could not be obtained are omitted and reported
    through logging.
    """
    if manual_refs is None:
        manual_refs = {}

    # Deduplicate citations while preserving first-occurrence order
    citekeys = list(dict.fromkeys(citekeys))

    # Install cache
    if requests_cache_path is not None:
        requests  # require `import requests` in case this is essential for monkey patching by requests_cache.
        requests_cache.install_cache(requests_cache_path, include_get_headers=True)
        cache = requests_cache.get_cache()
        if clear_requests_cache:
            logging.info("Clearing requests-cache")
            requests_cache.clear()
        logging.info(
            f"requests-cache starting with {len(cache.responses)} cached responses"
        )

    csl_items = []
    failures = []
    for standard_citekey in citekeys:
        # Manual references take precedence over retrieved metadata.
        if standard_citekey in manual_refs:
            csl_items.append(manual_refs[standard_citekey])
            continue
        if standard_citekey.startswith("raw:"):
            logging.error(
                f"CSL JSON Data with a standard_citekey of {standard_citekey!r} not found in manual-references.json. "
                "Metadata must be provided for raw citekeys.")
            failures.append(standard_citekey)
            # Raw citekeys can only be resolved through manual references, so
            # skip retrieval; falling through would record the failure twice.
            continue
        try:
            csl_item = citekey_to_csl_item(standard_citekey)
            csl_items.append(csl_item)
        except Exception:
            # Best effort: one failing citekey must not abort the whole run.
            logging.exception(
                f"Citeproc retrieval failure for {standard_citekey!r}")
            failures.append(standard_citekey)

    # Uninstall cache
    if requests_cache_path is not None:
        logging.info(
            f"requests-cache finished with {len(cache.responses)} cached responses"
        )
        requests_cache.uninstall_cache()

    if failures:
        message = "CSL JSON Data retrieval failed for the following standardized citation keys:\n{}".format(
            "\n".join(failures))
        logging.error(message)

    return csl_items