def get_citekeys_df(citekeys: list, citekey_aliases: dict = None):
    """
    Generate and return citekeys_df.

    citekeys_df is a pandas.DataFrame with the following columns:
    - manuscript_citekey: citation keys extracted from the manuscript content files.
    - detagged_citekey: manuscript_citekey but with tag citekeys dereferenced
    - standard_citekey: detagged_citekey standardized
    - short_citekey: standard_citekey hashed to create a shortened citekey

    Parameters:
    - citekeys: iterable of manuscript citekeys; duplicates are dropped.
    - citekey_aliases: optional mapping from a manuscript citekey to the
      citekey it is an alias for. None (the default) means no aliases.

    The returned DataFrame is sorted by standard_citekey, then
    detagged_citekey. check_collisions and check_multiple_citation_strings
    are called on it before it is returned.
    """
    # Use None as the default rather than a shared mutable dict.
    if citekey_aliases is None:
        citekey_aliases = {}
    citekeys_df = pandas.DataFrame(
        {"manuscript_citekey": list(citekeys)}
    ).drop_duplicates()
    # Citekeys without an alias map to themselves.
    citekeys_df["detagged_citekey"] = citekeys_df.manuscript_citekey.map(
        lambda citekey: citekey_aliases.get(citekey, citekey)
    )
    # The return value is discarded here; presumably is_valid_citekey reports
    # problems itself (e.g. via logging) -- confirm against its definition.
    for citation in citekeys_df.detagged_citekey:
        is_valid_citekey(citation, allow_raw=True)
    citekeys_df["standard_citekey"] = citekeys_df.detagged_citekey.map(
        standardize_citekey
    )
    citekeys_df["short_citekey"] = citekeys_df.standard_citekey.map(shorten_citekey)
    citekeys_df = citekeys_df.sort_values(["standard_citekey", "detagged_citekey"])
    check_collisions(citekeys_df)
    check_multiple_citation_strings(citekeys_df)
    return citekeys_df
def standardize_id(self):
    """
    Replace this CSL Item's "id" with its standardized citekey, in-place.

    Steps performed:
    1. Remember the "id" value present on entry (may be missing).
    2. Call self.infer_id(), then read the resulting "id" as the
       original_standard_id. (How infer_id chooses the id is defined
       elsewhere; presumably it consults the "standard_citation", "note",
       and "id" fields -- confirm against infer_id.)
    3. Assert that the inferred id is a valid citekey (raw citekeys allowed)
       and standardize it to obtain the final standard_id.
    4. Append to the note any of original_id, original_standard_id, and
       standard_id that differ from what the note already records. The two
       "original" values are only recorded when non-empty and different
       from standard_id.
    5. Set the item's id to standard_id and return self.

    Note that the Manubot software generally refers to the "id" of a CSL Item
    as a citekey. However, in this context, we use "id" rather than "citekey"
    for consistency with CSL's "id" field.
    """
    id_on_entry = self.get("id")
    self.infer_id()
    inferred_id = self["id"]
    assert is_valid_citekey(inferred_id, allow_raw=True)
    standard_id = standardize_citekey(inferred_id, warn_if_changed=False)

    note_additions = {}
    existing_note = self.note_dict
    provenance = (
        ("original_id", id_on_entry),
        ("original_standard_id", inferred_id),
    )
    for note_key, value in provenance:
        # Skip empty values, values equal to the final id, and values the
        # note already holds under this key.
        if value and value != standard_id and value != existing_note.get(note_key):
            note_additions[note_key] = value
    if standard_id != existing_note.get("standard_id"):
        note_additions["standard_id"] = standard_id

    self.note_append_dict(dictionary=note_additions)
    self.set_id(standard_id)
    return self
def get_citekeys_df(args, text):
    """
    Generate citekeys_df, write it as a TSV to args.citations_path, and
    return it.

    citekeys_df is a pandas.DataFrame with the following columns:
    - manuscript_citekey: citation keys extracted from the manuscript content files.
    - detagged_citekey: manuscript_citekey but with tag citekeys dereferenced
    - standard_citekey: detagged_citekey standardized
    - short_citekey: standard_citekey hashed to create a shortened citekey

    Parameters:
    - args: object providing citation_tags_path (a path-like with is_file())
      and citations_path (the TSV output location).
    - text: manuscript text passed to get_citekeys.

    If args.citation_tags_path exists it is read as a TSV with "tag" and
    "citation" columns; each tag becomes the manuscript citekey "tag:<tag>"
    and is dereferenced to its citation. Otherwise no tags are applied.
    check_collisions and check_multiple_citation_strings are called after
    the TSV is written.
    """
    citekeys_df = pandas.DataFrame({"manuscript_citekey": get_citekeys(text)})
    if args.citation_tags_path.is_file():
        tag_df = pandas.read_csv(args.citation_tags_path, sep="\t")
        na_rows_df = tag_df[tag_df.isnull().any(axis="columns")]
        if not na_rows_df.empty:
            logging.error(
                f"{args.citation_tags_path} contains rows with missing values:\n"
                f"{na_rows_df}\n"
                "This error can be caused by using spaces rather than tabs to delimit fields.\n"
                "Proceeding to reread TSV with delim_whitespace=True."
            )
            # sep=r"\s+" is the documented equivalent of the deprecated
            # delim_whitespace=True keyword.
            tag_df = pandas.read_csv(args.citation_tags_path, sep=r"\s+")
        tag_df["manuscript_citekey"] = "tag:" + tag_df.tag
        tag_df = tag_df.rename(columns={"citation": "detagged_citekey"})
        # Return value is discarded; presumably is_valid_citekey reports
        # problems itself (e.g. via logging) -- confirm.
        for detagged_citekey in tag_df.detagged_citekey:
            is_valid_citekey(detagged_citekey, allow_raw=True)
        citekeys_df = citekeys_df.merge(
            tag_df[["manuscript_citekey", "detagged_citekey"]], how="left"
        )
    else:
        citekeys_df["detagged_citekey"] = None
        logging.info(
            f"missing {args.citation_tags_path} file: no citation tags (citekey aliases) set"
        )
    # Citekeys that are not tags stand for themselves. Assign the result back
    # instead of calling fillna(inplace=True) on a column accessor, which
    # operates on a temporary object under pandas copy-on-write.
    citekeys_df["detagged_citekey"] = citekeys_df["detagged_citekey"].fillna(
        citekeys_df["manuscript_citekey"].astype(str)
    )
    citekeys_df["standard_citekey"] = citekeys_df.detagged_citekey.map(
        standardize_citekey
    )
    citekeys_df["short_citekey"] = citekeys_df.standard_citekey.map(shorten_citekey)
    citekeys_df = citekeys_df.sort_values(["standard_citekey", "detagged_citekey"])
    citekeys_df.to_csv(args.citations_path, sep="\t", index=False)
    check_collisions(citekeys_df)
    check_multiple_citation_strings(citekeys_df)
    return citekeys_df
def csl_item_set_standard_id(csl_item):
    """
    Extract the standard_id (standard citation key) for a csl_item and modify the csl_item in-place to set its "id" field.

    The standard_id is extracted from a "standard_citation" field, the
    "note" field, or the "id" field, with that order of precedence
    (a later source in the code overrides an earlier one).
    If extracting the citation from the "id" field, uses the
    infer_citekey_prefix function to set the prefix.
    For example, if the extracted standard_id does not begin with a supported
    prefix (e.g. "doi:", "pmid:" or "raw:"), the citation is assumed to be raw
    and given a "raw:" prefix.
    The extracted citation (referred to as "original_standard_id") is checked
    for validity and standardized, after which it is the final "standard_id".

    Regarding csl_item modification, the csl_item "id" field is set to the
    standard_id, any "standard_citation" field is removed, and the note field
    is created or updated with key-value pairs for standard_id,
    original_standard_id, and original_id.

    Note that the Manubot software generally refers to the "id" of a CSL Item
    as a citekey. However, in this context, we use "id" rather than "citekey"
    for consistency with CSL's "id" field.

    Returns the same csl_item object that was passed in.

    Raises ValueError if csl_item is not a dict or if no candidate id can be
    found. Raises AssertionError if the candidate id is not a valid citekey.
    """
    if not isinstance(csl_item, dict):
        raise ValueError(
            "csl_item must be a CSL Data Item represented as a Python dictionary"
        )
    # Function-scope import, presumably to avoid a circular import between
    # modules -- confirm before moving it to module level.
    from manubot.cite.citeproc import (
        append_to_csl_item_note,
        parse_csl_item_note,
    )

    note_dict = parse_csl_item_note(csl_item.get('note', ''))

    original_id = None
    original_standard_id = None
    # Candidates are checked from lowest to highest precedence.
    if 'id' in csl_item:
        original_id = csl_item['id']
        original_standard_id = infer_citekey_prefix(original_id)
    if 'standard_id' in note_dict:
        original_standard_id = note_dict['standard_id']
    if 'standard_citation' in csl_item:
        original_standard_id = csl_item.pop('standard_citation')
    if original_standard_id is None:
        raise ValueError(
            'csl_item_set_standard_id could not detect a field with a citation / standard_citation. '
            'Consider setting the CSL Item "id" field.'
        )
    # NOTE: assert statements are removed when Python runs with -O; kept so
    # that callers continue to see AssertionError for invalid citekeys.
    assert is_valid_citekey(original_standard_id, allow_raw=True)
    standard_id = standardize_citekey(
        original_standard_id, warn_if_changed=False)

    # Only record values that add information beyond the final standard_id
    # and beyond what the note already contains.
    add_to_note = {}
    if original_id and original_id != standard_id:
        if original_id != note_dict.get('original_id'):
            add_to_note['original_id'] = original_id
    if original_standard_id and original_standard_id != standard_id:
        if original_standard_id != note_dict.get('original_standard_id'):
            add_to_note['original_standard_id'] = original_standard_id
    if standard_id != note_dict.get('standard_id'):
        add_to_note['standard_id'] = standard_id
    append_to_csl_item_note(csl_item, dictionary=add_to_note)
    csl_item['id'] = standard_id
    return csl_item
def get_citekeys(text):
    """
    Return the sorted, deduplicated citekeys found in a text.

    Every match of citekey_pattern in the text is collected, and matches
    that fail is_valid_citekey (with tags, raw citekeys, and pandoc-xnos
    keys allowed) are dropped. Citations that are clearly invalid such as
    `doi:/453` are therefore not returned.
    """
    unique_matches = set(citekey_pattern.findall(text))
    return sorted(
        match
        for match in unique_matches
        if is_valid_citekey(
            match, allow_tag=True, allow_raw=True, allow_pandoc_xnos=True
        )
    )
def cli_cite(args):
    """
    Main function for the manubot cite command-line interface.

    Does not allow user to directly specify Pandoc's --to argument, due to
    inconsistent citation rendering by output format.
    See https://github.com/jgm/pandoc/issues/4834

    Attributes read from args:
    - citekeys: iterable of citekeys to look up.
    - prune_csl: forwarded to citekey_to_csl_item as prune.
    - render: when false, CSL JSON is written and the function returns.
    - output: path to write to, or a false value to use standard output.
    - format, csl: used only when rendering with Pandoc.

    Invalid citekeys are skipped. A failure while processing one citekey is
    logged and does not stop the remaining citekeys from being processed.
    """
    # generate CSL JSON data
    csl_list = []
    for citekey in args.citekeys:
        try:
            if not is_valid_citekey(citekey):
                continue
            citekey = standardize_citekey(citekey)
            csl_item = citekey_to_csl_item(citekey, prune=args.prune_csl)
            csl_list.append(csl_item)
        except Exception as error:
            # Best-effort: report the failure and carry on with the rest.
            logging.error(f'citekey_to_csl_item for {citekey!r} failed '
                          f'due to a {error.__class__.__name__}:\n{error}')
            logging.info(error, exc_info=True)

    # output CSL JSON data, if --render is False
    if not args.render:
        # Serialize before opening the destination so that a serialization
        # error cannot leave behind a truncated output file.
        csl_json = json.dumps(csl_list, ensure_ascii=False, indent=2) + '\n'
        if args.output:
            with args.output.open('w', encoding='utf-8') as write_file:
                write_file.write(csl_json)
        else:
            # Write to stdout without a `with` block: using sys.stdout as a
            # context manager would close it for the rest of the process.
            sys.stdout.write(csl_json)
        return

    # use Pandoc to render references
    if not args.format and args.output:
        args.format = extension_to_format.get(args.output.suffix)
    if not args.format:
        args.format = 'plain'
    pandoc_metadata = {
        'nocite': '@*',
        'csl': args.csl,
        'references': csl_list,
    }
    call_pandoc(
        metadata=pandoc_metadata,
        path=args.output,
        format=args.format,
    )
def process_record(record):
    """
    Expand a catalog record with retrieved metadata.

    The record's 'html_url' and 'repo_url' keys are required and are popped
    from it, as are the optional 'thumbnail_url', 'preprint_citation', and
    'journal_citation' keys. When no 'thumbnail_url' is supplied, one is
    looked up with get_thumbnail_url_from_html.

    The result always has a 'manubot' entry and gains a 'preprint' and/or
    'journal' entry for each non-empty citation that passes
    is_valid_citekey. Each entry is filled with fields derived from the CSL
    Item of its citation. Whatever remains in record is returned under
    'extras'.
    """
    html_url = record.pop('html_url')
    manubot_entry = {
        'repo_url': record.pop('repo_url'),
        'url': html_url,
        'citation': f"url:{html_url}",
    }
    output = {'manubot': manubot_entry}

    thumbnail_url = (
        record.pop('thumbnail_url')
        if 'thumbnail_url' in record
        else get_thumbnail_url_from_html(html_url)
    )
    if thumbnail_url:
        manubot_entry['thumbnail_url'] = thumbnail_url

    for publication_type in ('preprint', 'journal'):
        citation = record.pop(f'{publication_type}_citation', None)
        if citation and is_valid_citekey(citation):
            output[publication_type] = {'citation': citation}

    for entry in output.values():
        csl_item = citekey_to_csl_item(standardize_citekey(entry['citation']))
        # Keep a url that is already set; otherwise fall back to the CSL URL.
        if 'url' not in entry and 'URL' in csl_item:
            entry['url'] = csl_item['URL']
        entry.update({
            'title': get_title(csl_item),
            'authors': get_authors_text(csl_item),
            'journal': get_journal(csl_item),
            'date_iso': get_date(csl_item),
            'date_human': get_date_summary(csl_item),
            'csl_item': csl_item,
        })

    output['extras'] = record
    return output
def process_citations(doc):
    """
    Apply citation-by-identifier to a Python object representation of
    Pandoc's Abstract Syntax Tree.

    The following Pandoc metadata fields are considered:

    - bibliography (use to define reference metadata manually)
    - citekey-aliases (use to define tags for cite-by-id citations)
    - manubot-requests-cache-path
    - manubot-clear-requests-cache
    - manubot-output-citekeys: path to write TSV table of citekeys
    - manubot-output-bibliography: path to write generated CSL JSON bibliography

    The "references" metadata field is also read and passed to
    load_manual_references as extra CSL Items. Intermediate results are
    stored in the module-level global_variables dict under the keys
    "citekey_aliases", "manuscript_citekeys", "citekeys_df", and
    "citekey_shortener". On completion the document metadata fields
    "bibliography", "references", and "citekey_aliases" are overwritten.
    """
    citekey_aliases = doc.get_metadata("citekey-aliases", default={})
    if not isinstance(citekey_aliases, dict):
        logging.warning(
            f"Expected metadata.citekey-aliases to be a dict, "
            f"but received a {citekey_aliases.__class__.__name__}. Disregarding."
        )
        citekey_aliases = {}
    global_variables["citekey_aliases"] = citekey_aliases

    # These walks presumably populate global_variables (aliases and
    # "manuscript_citekeys") as a side effect -- see the action functions.
    doc.walk(_get_reference_link_citekey_aliases)
    doc.walk(_get_citekeys_action)

    # Deduplicate, drop invalid citekeys, and sort.
    unique_citekeys = set(global_variables["manuscript_citekeys"])
    manuscript_citekeys = sorted(
        citekey
        for citekey in unique_citekeys
        if is_valid_citekey(
            citekey, allow_tag=True, allow_raw=True, allow_pandoc_xnos=True
        )
    )
    global_variables["manuscript_citekeys"] = manuscript_citekeys

    citekeys_df = get_citekeys_df(
        manuscript_citekeys,
        global_variables["citekey_aliases"],
    )
    global_variables["citekeys_df"] = citekeys_df
    global_variables["citekey_shortener"] = dict(
        zip(citekeys_df["manuscript_citekey"], citekeys_df["short_citekey"])
    )
    doc.walk(_citation_to_id_action)

    # Gather manually supplied references from metadata and bibliography files.
    inline_references = doc.get_metadata("references", default=[])
    bibliography_paths = doc.get_metadata("bibliography", default=[])
    if not isinstance(bibliography_paths, list):
        bibliography_paths = [bibliography_paths]
    manual_refs = load_manual_references(
        bibliography_paths, extra_csl_items=inline_references
    )

    standard_citekeys = citekeys_df.standard_citekey.unique()

    # Make sure the directory for the requests cache exists before use.
    requests_cache_path = doc.get_metadata("manubot-requests-cache-path")
    if requests_cache_path:
        cache_directory = pathlib.Path(requests_cache_path).parent
        cache_directory.mkdir(parents=True, exist_ok=True)

    csl_items = generate_csl_items(
        citekeys=standard_citekeys,
        manual_refs=manual_refs,
        requests_cache_path=doc.get_metadata("manubot-requests-cache-path"),
        clear_requests_cache=doc.get_metadata("manubot-clear-requests-cache", False),
    )

    citekeys_output_path = doc.get_metadata("manubot-output-citekeys")
    write_citekeys_tsv(citekeys_df, path=citekeys_output_path)
    bibliography_output_path = doc.get_metadata("manubot-output-bibliography")
    write_csl_json(csl_items, path=bibliography_output_path)

    # Update pandoc metadata with fields that this filter
    # has either consumed, created, or modified.
    doc.metadata["bibliography"] = []
    doc.metadata["references"] = csl_items
    doc.metadata["citekey_aliases"] = citekey_aliases