Example #1
0
def citation_to_citeproc(*args, **kwargs):
    """
    Deprecated alias for `citekey_to_csl_item`.

    Emits a FutureWarning and then forwards all positional and keyword
    arguments unchanged, returning whatever `citekey_to_csl_item` returns.
    """
    import warnings
    warnings.warn(
        "'citation_to_citeproc' has been renamed to 'citekey_to_csl_item'"
        " and will be removed in a future release.",
        category=FutureWarning,
        # Attribute the warning to the caller's line, not to this shim,
        # so users can find the call site that needs updating.
        stacklevel=2,
    )
    return citekey_to_csl_item(*args, **kwargs)
Example #2
0
def generate_csl_items(args, citekeys_df):
    """
    Generate CSL (citeproc) items for standard_citekeys in citekeys_df.
    Writes references.json to disk and logs warnings for potential problems.

    Parameters:

    - args: namespace providing `manual_references_paths`,
      `requests_cache_path`, `clear_requests_cache` and `references_path`
      (the latter is opened for writing through its `.open` method).
    - citekeys_df: table with a `standard_citekey` column; every unique
      value is processed once.

    Returns the list of CSL Items that was written to `args.references_path`.
    Citekeys whose metadata could not be produced are logged and omitted.
    """
    # Read manual references (overrides) in JSON CSL
    manual_refs = load_manual_references(args.manual_references_paths)

    requests_cache.install_cache(args.requests_cache_path,
                                 include_get_headers=True)
    csl_items = list()
    failures = list()
    try:
        cache = requests_cache.get_cache()
        if args.clear_requests_cache:
            logging.info('Clearing requests-cache')
            requests_cache.clear()
        logging.info(
            f'requests-cache starting with {len(cache.responses)} cached responses'
        )

        for standard_citekey in citekeys_df.standard_citekey.unique():
            if standard_citekey in manual_refs:
                csl_items.append(manual_refs[standard_citekey])
                continue
            if standard_citekey.startswith('raw:'):
                logging.error(
                    f'CSL JSON Data with a standard_citekey of {standard_citekey!r} not found in manual-references.json. '
                    'Metadata must be provided for raw citekeys.')
                failures.append(standard_citekey)
                # Raw citekeys have no retrievable metadata: skip the lookup
                # so the key is not attempted (and reported) a second time.
                continue
            try:
                csl_item = citekey_to_csl_item(standard_citekey)
                csl_items.append(csl_item)
            except Exception:
                logging.exception(
                    f'Citeproc retrieval failure for {standard_citekey!r}')
                failures.append(standard_citekey)

        logging.info(
            f'requests-cache finished with {len(cache.responses)} cached responses'
        )
    finally:
        # The cache is installed globally; always remove it again, even when
        # processing is interrupted.
        requests_cache.uninstall_cache()

    if failures:
        message = 'CSL JSON Data retrieval failed for the following standardized citation keys:\n{}'.format(
            '\n'.join(failures))
        logging.error(message)

    # Write JSON CSL bibliography for Pandoc.
    with args.references_path.open('w', encoding='utf-8') as write_file:
        json.dump(csl_items, write_file, indent=2, ensure_ascii=False)
        write_file.write('\n')
    return csl_items
Example #3
0
def cli_cite(args):
    """
    Main function for the manubot cite command-line interface.

    Does not allow user to directly specify Pandoc's --to argument, due to
    inconsistent citation rendering by output format. See
    https://github.com/jgm/pandoc/issues/4834

    Reads `args.citekeys`, `args.prune_csl`, `args.render`, `args.output`,
    `args.format` and `args.csl`. When `args.render` is false, the CSL JSON
    is written to `args.output` (or standard output when that is unset);
    otherwise the references are handed to `call_pandoc`. Returns None.
    """
    # generate CSL JSON data
    csl_list = list()
    for citekey in args.citekeys:
        try:
            if not is_valid_citekey(citekey):
                continue
            citekey = standardize_citekey(citekey)
            csl_item = citekey_to_csl_item(citekey, prune=args.prune_csl)
            csl_list.append(csl_item)
        except Exception as error:
            logging.error(f'citekey_to_csl_item for {citekey!r} failed '
                          f'due to a {error.__class__.__name__}:\n{error}')
            logging.info(error, exc_info=True)

    # output CSL JSON data, if --render is False
    if not args.render:
        if args.output:
            with args.output.open('w', encoding='utf-8') as write_file:
                json.dump(csl_list, write_file, ensure_ascii=False, indent=2)
                write_file.write('\n')
        else:
            # sys.stdout is not ours to close: using it as a context manager
            # would close the stream for the rest of the process.
            json.dump(csl_list, sys.stdout, ensure_ascii=False, indent=2)
            sys.stdout.write('\n')
        return

    # use Pandoc to render references
    if not args.format and args.output:
        # Infer the output format from the file extension when possible.
        vars(args)['format'] = extension_to_format.get(args.output.suffix)
    if not args.format:
        vars(args)['format'] = 'plain'
    pandoc_metadata = {
        'nocite': '@*',
        'csl': args.csl,
        'references': csl_list,
    }
    call_pandoc(
        metadata=pandoc_metadata,
        path=args.output,
        format=args.format,
    )
Example #4
0
def process_record(record):
    """
    Build an expanded catalog entry for `record` using retrieved metadata.

    `record` is consumed in place: recognized keys are popped from it and
    whatever remains is returned under the 'extras' key. The result always
    has a 'manubot' entry and may have 'preprint' / 'journal' entries when
    the corresponding `*_citation` value is present and passes inspection.
    """
    html_url = record.pop('html_url')
    manubot_entry = {
        'repo_url': record.pop('repo_url'),
        'url': html_url,
        'citation': f"url:{html_url}",
    }
    output = {'manubot': manubot_entry}

    # An explicit thumbnail in the record wins; otherwise defer to
    # get_thumbnail_url_from_html for the manuscript URL.
    thumbnail_url = (
        record.pop('thumbnail_url')
        if 'thumbnail_url' in record
        else get_thumbnail_url_from_html(html_url)
    )
    if thumbnail_url:
        manubot_entry['thumbnail_url'] = thumbnail_url

    for publication_type in ('preprint', 'journal'):
        citation = record.pop(f'{publication_type}_citation', None)
        if citation:
            citekey = CiteKey(citation)
            if citekey.is_handled_prefix:
                report = citekey.inspect()
                if report:
                    logging.warning(
                        f"citekey failed inspection: {citation!r}\n{report}")
                else:
                    output[publication_type] = {
                        'citation': citekey.standard_id,
                    }
            else:
                logging.warning(f"unhandled citekey: {citation!r}")

    # Enrich every entry gathered so far with fields derived from its CSL Item.
    for entry in output.values():
        csl_item = citekey_to_csl_item(entry['citation'])
        if 'url' not in entry and 'URL' in csl_item:
            entry['url'] = csl_item['URL']
        entry.update({
            'title': get_title(csl_item),
            'authors': get_authors_text(csl_item),
            'journal': get_journal(csl_item),
            'date_iso': get_date(csl_item),
            'date_human': get_date_summary(csl_item),
            'csl_item': csl_item,
        })

    output['extras'] = record
    return output
Example #5
0
def get_csl_item(url) -> dict:
    """
    Generate a CSL JSON item for a URL. Currently, does not work
    for most PDF URLs unless they are from known domains where
    persistent identifiers can be extracted.

    Raises ManubotProcessingError when any step of the lookup fails or
    when the lookup yields an empty result; the underlying exception is
    attached as the cause.
    """
    from manubot.cite.citekey import citekey_to_csl_item, url_to_citekey

    try:
        citekey = url_to_citekey(url)
        citekey = RHCiteKey(citekey)
        csl_item = citekey_to_csl_item(citekey)

        if not csl_item:
            # An empty result is treated as a failure, same as an exception.
            raise Exception(f"Error searching for paper: {url}")
        return csl_item
    except Exception as e:
        # Chain explicitly so the traceback marks the original error as the
        # direct cause of the wrapper exception.
        raise ManubotProcessingError(e) from e
Example #6
0
 def get_csl_items(self) -> tp.List:
     """
     Build and return the references list (bibliography) for
     `self.citekeys` as a list of CSL_Items.

     Resets and repopulates two attributes:

     - `self.csl_items`: the CSL_Items that could be generated, one per
       group of citekeys sharing a standard_id.
     - `self.input_to_csl_id`: maps each citekey's input_id to the ID of
       its CSL_Item (i.e. short_id). Citekeys whose standard_id yielded
       no CSL_Item are left out.
     """
     self.input_to_csl_id = {}
     self.csl_items = []
     for _, members in self.group_citekeys_by("standard_id"):
         # The first citekey of the group is the one used for the lookup.
         representative = members[0]
         item = citekey_to_csl_item(
             citekey=representative,
             prune=self.prune_csl_items,
             log_level=self.csl_item_failure_log_level,
             manual_refs=self.manual_refs,
         )
         if not item:
             continue
         for member in members:
             self.input_to_csl_id[member.input_id] = item["id"]
         self.csl_items.append(item)
     return self.csl_items
Example #7
0
def process_record(record):
    """
    Turn a catalog record into an expanded entry with retrieved metadata.

    Known keys are popped out of `record` (which is therefore modified in
    place); the leftover mapping is returned under 'extras'. A 'manubot'
    entry is always produced, and 'preprint' / 'journal' entries are added
    when the record supplies a citation that `is_valid_citekey` accepts.
    """
    html_url = record.pop('html_url')
    repo_url = record.pop('repo_url')
    output = {
        'manubot': {
            'repo_url': repo_url,
            'url': html_url,
            'citation': f"url:{html_url}",
        },
    }

    # Use the record's own thumbnail when given, else defer to
    # get_thumbnail_url_from_html for the manuscript URL.
    thumbnail_url = (
        record.pop('thumbnail_url')
        if 'thumbnail_url' in record
        else get_thumbnail_url_from_html(html_url)
    )
    if thumbnail_url:
        output['manubot']['thumbnail_url'] = thumbnail_url

    for publication_type in ('preprint', 'journal'):
        citation = record.pop(f'{publication_type}_citation', None)
        if citation and is_valid_citekey(citation):
            output[publication_type] = {'citation': citation}

    # Attach fields derived from the CSL Item of each entry's citation.
    for entry in output.values():
        standard_citation = standardize_citekey(entry['citation'])
        csl_item = citekey_to_csl_item(standard_citation)
        if 'url' not in entry and 'URL' in csl_item:
            entry['url'] = csl_item['URL']
        entry['title'] = get_title(csl_item)
        entry['authors'] = get_authors_text(csl_item)
        entry['journal'] = get_journal(csl_item)
        entry['date_iso'] = get_date(csl_item)
        entry['date_human'] = get_date_summary(csl_item)
        entry['csl_item'] = csl_item

    output['extras'] = record
    return output
Example #8
0
def generate_csl_items(
    citekeys: list,
    manual_refs: Optional[dict] = None,
    requests_cache_path: Optional[str] = None,
    clear_requests_cache: Optional[bool] = False,
) -> list:
    """
    Generate CSL (citeproc) items for the given standard_citekeys.

    Parameters:

    - citekeys: list of standard_citekeys. Duplicates are ignored; the order
      of first occurrence is kept.
    - manual_refs: mapping from standard_citekey to csl_item for manual references.
      Defaults to no manual references.
    - requests_cache_path: path for the requests cache database.
      Passed as cache_name to `requests_cache.install_cache`.
      requests_cache may append an extension to this path, so it is not always the exact
      path to the cache. If None, do not use requests_cache.
    - clear_requests_cache: If True, clear the requests cache before generating citekey metadata.

    Returns the list of CSL Items that could be produced. Citekeys for which
    no item could be produced are logged and left out of the result.
    """
    if manual_refs is None:
        manual_refs = {}

    # Deduplicate citations
    citekeys = list(dict.fromkeys(citekeys))

    # Install cache
    use_cache = requests_cache_path is not None
    if use_cache:
        requests  # require `import requests` in case this is essential for monkey patching by requests_cache.
        requests_cache.install_cache(requests_cache_path,
                                     include_get_headers=True)

    csl_items = list()
    failures = list()
    try:
        if use_cache:
            cache = requests_cache.get_cache()
            if clear_requests_cache:
                logging.info("Clearing requests-cache")
                requests_cache.clear()
            logging.info(
                f"requests-cache starting with {len(cache.responses)} cached responses"
            )

        for standard_citekey in citekeys:
            if standard_citekey in manual_refs:
                csl_items.append(manual_refs[standard_citekey])
                continue
            if standard_citekey.startswith("raw:"):
                logging.error(
                    f"CSL JSON Data with a standard_citekey of {standard_citekey!r} not found in manual-references.json. "
                    "Metadata must be provided for raw citekeys.")
                failures.append(standard_citekey)
                # Raw citekeys have no retrievable metadata: skip the lookup
                # so the key is not attempted (and reported) a second time.
                continue
            try:
                csl_item = citekey_to_csl_item(standard_citekey)
                csl_items.append(csl_item)
            except Exception:
                logging.exception(
                    f"Citeproc retrieval failure for {standard_citekey!r}")
                failures.append(standard_citekey)

        if use_cache:
            logging.info(
                f"requests-cache finished with {len(cache.responses)} cached responses"
            )
    finally:
        # Uninstall cache. It is installed globally, so remove it even when
        # processing is interrupted.
        if use_cache:
            requests_cache.uninstall_cache()

    if failures:
        message = "CSL JSON Data retrieval failed for the following standardized citation keys:\n{}".format(
            "\n".join(failures))
        logging.error(message)

    return csl_items