Beispiel #1
0
def get_citekeys_df(citekeys: list, citekey_aliases: dict = None):
    """
    Generate and return citekeys_df.

    citekeys_df is a pandas.DataFrame with the following columns:
    - manuscript_citekey: citation keys extracted from the manuscript content files.
    - detagged_citekey: manuscript_citekey but with tag citekeys dereferenced
    - standard_citekey: detagged_citekey standardized
    - short_citekey: standard_citekey hashed to create a shortened citekey

    Parameters:
    - citekeys: iterable of manuscript citekeys; duplicates are dropped.
    - citekey_aliases: optional mapping from a manuscript citekey to the
      citekey it stands for. Keys that are absent from the mapping are
      passed through unchanged. None (the default) means no aliases.

    The returned DataFrame is sorted by standard_citekey, then
    detagged_citekey, and has been passed to check_collisions and
    check_multiple_citation_strings before being returned.
    """
    # A None sentinel avoids sharing one mutable default dict across calls.
    if citekey_aliases is None:
        citekey_aliases = {}
    citekeys_df = pandas.DataFrame({
        "manuscript_citekey": list(citekeys)
    }).drop_duplicates()
    citekeys_df["detagged_citekey"] = citekeys_df.manuscript_citekey.map(
        lambda citekey: citekey_aliases.get(citekey, citekey))
    # The return value is deliberately ignored here: the call is made once
    # per detagged citekey purely for whatever reporting is_valid_citekey does.
    for citation in citekeys_df.detagged_citekey:
        is_valid_citekey(citation, allow_raw=True)
    citekeys_df["standard_citekey"] = citekeys_df.detagged_citekey.map(
        standardize_citekey)
    citekeys_df["short_citekey"] = citekeys_df.standard_citekey.map(
        shorten_citekey)
    citekeys_df = citekeys_df.sort_values(
        ["standard_citekey", "detagged_citekey"])
    check_collisions(citekeys_df)
    check_multiple_citation_strings(citekeys_df)
    return citekeys_df
Beispiel #2
0
    def standardize_id(self):
        """
        Replace this CSL Item's "id" with its standardized form, in-place.

        Steps performed:
        1. Remember the "id" the item had on entry (may be absent).
        2. Call self.infer_id(), after which self["id"] is read as the
           not-yet-standardized id ("original_standard_id").
        3. Check that id with is_valid_citekey (raw citekeys allowed) and
           standardize it with standardize_citekey; the result is "standard_id".
        4. Append to the item's note whichever of original_id,
           original_standard_id and standard_id differ from what the note
           already records. The two "original" values are only recorded when
           they are non-empty and differ from standard_id.
        5. Set the item's id to standard_id.

        Note that the Manubot software generally refers to the "id" of a CSL Item as a citekey.
        However, in this context, we use "id" rather than "citekey" for consistency with CSL's "id" field.

        Returns self, so calls can be chained.
        """
        id_on_entry = self.get("id")
        self.infer_id()
        inferred_id = self["id"]
        assert is_valid_citekey(inferred_id, allow_raw=True)
        standard_id = standardize_citekey(inferred_id, warn_if_changed=False)
        note_updates = {}
        existing_note = self.note_dict
        # Provenance fields, checked in the order they should appear in the note.
        provenance = (
            ("original_id", id_on_entry),
            ("original_standard_id", inferred_id),
        )
        for field, value in provenance:
            if value and value != standard_id and value != existing_note.get(field):
                note_updates[field] = value
        if standard_id != existing_note.get("standard_id"):
            note_updates["standard_id"] = standard_id
        self.note_append_dict(dictionary=note_updates)
        self.set_id(standard_id)
        return self
Beispiel #3
0
def get_citekeys_df(args, text):
    """
    Generate citekeys_df and save it to 'citations.tsv'.

    citekeys_df is a pandas.DataFrame with the following columns:
    - manuscript_citekey: citation keys extracted from the manuscript content files.
    - detagged_citekey: manuscript_citekey but with tag citekeys dereferenced
    - standard_citekey: detagged_citekey standardized
    - short_citekey: standard_citekey hashed to create a shortened citekey

    Parameters:
    - args: object providing citation_tags_path (path to an optional TSV of
      citation tags with "tag" and "citation" columns) and citations_path
      (where the resulting table is written as TSV).
    - text: manuscript text from which citekeys are extracted via get_citekeys.

    Returns the DataFrame, sorted by standard_citekey then detagged_citekey.
    The table is written to args.citations_path before check_collisions and
    check_multiple_citation_strings are run on it.
    """
    citekeys_df = pandas.DataFrame({"manuscript_citekey": get_citekeys(text)})
    if args.citation_tags_path.is_file():
        tag_df = pandas.read_csv(args.citation_tags_path, sep="\t")
        na_rows_df = tag_df[tag_df.isnull().any(axis="columns")]
        if not na_rows_df.empty:
            logging.error(
                f"{args.citation_tags_path} contains rows with missing values:\n"
                f"{na_rows_df}\n"
                "This error can be caused by using spaces rather than tabs to delimit fields.\n"
                "Proceeding to reread TSV with delim_whitespace=True."
            )
            # sep=r"\s+" is the documented equivalent of delim_whitespace=True,
            # which is deprecated in recent pandas releases.
            tag_df = pandas.read_csv(args.citation_tags_path, sep=r"\s+")
        tag_df["manuscript_citekey"] = "tag:" + tag_df.tag
        tag_df = tag_df.rename(columns={"citation": "detagged_citekey"})
        # Return value ignored: called once per tag target for whatever
        # reporting is_valid_citekey performs.
        for detagged_citekey in tag_df.detagged_citekey:
            is_valid_citekey(detagged_citekey, allow_raw=True)
        citekeys_df = citekeys_df.merge(
            tag_df[["manuscript_citekey", "detagged_citekey"]], how="left"
        )
    else:
        citekeys_df["detagged_citekey"] = None
        logging.info(
            f"missing {args.citation_tags_path} file: no citation tags (citekey aliases) set"
        )
    # Citekeys without a tag entry stand for themselves. Assign the filled
    # column back explicitly: an in-place fillna on an attribute-accessed
    # column is chained mutation and may not update the frame under
    # pandas copy-on-write.
    citekeys_df["detagged_citekey"] = citekeys_df["detagged_citekey"].fillna(
        citekeys_df["manuscript_citekey"].astype(str)
    )
    citekeys_df["standard_citekey"] = citekeys_df.detagged_citekey.map(
        standardize_citekey
    )
    citekeys_df["short_citekey"] = citekeys_df.standard_citekey.map(shorten_citekey)
    citekeys_df = citekeys_df.sort_values(["standard_citekey", "detagged_citekey"])
    citekeys_df.to_csv(args.citations_path, sep="\t", index=False)
    check_collisions(citekeys_df)
    check_multiple_citation_strings(citekeys_df)
    return citekeys_df
Beispiel #4
0
def csl_item_set_standard_id(csl_item):
    """
    Set the "id" of a CSL Item (a dict) to its standardized citation key, in-place.

    The id to standardize ("original_standard_id") is taken from the first
    available of the following, highest priority first:
    1. the item's "standard_citation" field (which is removed from the item),
    2. a "standard_id" entry parsed from the item's "note" field,
    3. the item's "id" field, passed through infer_citekey_prefix.

    That value is checked with is_valid_citekey (raw citekeys allowed) and
    standardized with standardize_citekey to give the final "standard_id".
    The note is then extended with whichever of original_id,
    original_standard_id and standard_id are not already recorded there; the
    two "original" values are only recorded when they are non-empty and
    differ from standard_id.

    Note that the Manubot software generally refers to the "id" of a CSL Item as a citekey.
    However, in this context, we use "id" rather than "citekey" for consistency with CSL's "id" field.

    Raises ValueError if csl_item is not a dict or if none of the three
    sources above yields an id. Returns the same csl_item object.
    """
    if not isinstance(csl_item, dict):
        raise ValueError(
            "csl_item must be a CSL Data Item represented as a Python dictionary")

    from manubot.cite.citeproc import (
        append_to_csl_item_note,
        parse_csl_item_note,
    )
    existing_note = parse_csl_item_note(csl_item.get('note', ''))

    # Sources are consulted from lowest to highest priority so that a later
    # source overrides an earlier one.
    id_on_entry = None
    candidate_id = None
    if 'id' in csl_item:
        id_on_entry = csl_item['id']
        candidate_id = infer_citekey_prefix(id_on_entry)
    if 'standard_id' in existing_note:
        candidate_id = existing_note['standard_id']
    if 'standard_citation' in csl_item:
        candidate_id = csl_item.pop('standard_citation')
    if candidate_id is None:
        raise ValueError(
            'csl_item_set_standard_id could not detect a field with a citation / standard_citation. '
            'Consider setting the CSL Item "id" field.')
    assert is_valid_citekey(candidate_id, allow_raw=True)
    standard_id = standardize_citekey(candidate_id, warn_if_changed=False)

    note_updates = {}
    # Provenance fields, checked in the order they should appear in the note.
    provenance = (
        ('original_id', id_on_entry),
        ('original_standard_id', candidate_id),
    )
    for field, value in provenance:
        if value and value != standard_id and value != existing_note.get(field):
            note_updates[field] = value
    if standard_id != existing_note.get('standard_id'):
        note_updates['standard_id'] = standard_id
    append_to_csl_item_note(csl_item, dictionary=note_updates)
    csl_item['id'] = standard_id
    return csl_item
Beispiel #5
0
def get_citekeys(text):
    """
    Return the sorted, deduplicated citekeys found in a text.

    Matches of citekey_pattern are deduplicated first, then each distinct
    match is kept only if is_valid_citekey accepts it (tag, raw and
    pandoc-xnos citekeys are allowed). Clearly invalid citations such as
    `doi:/453` are therefore not returned.
    """
    distinct_matches = set(citekey_pattern.findall(text))
    return sorted(
        match
        for match in distinct_matches
        if is_valid_citekey(
            match, allow_tag=True, allow_raw=True, allow_pandoc_xnos=True)
    )
Beispiel #6
0
def cli_cite(args):
    """
    Main function for the manubot cite command-line interface.

    Reads the following attributes of args:
    - citekeys: iterable of citekeys to look up.
    - prune_csl: forwarded to citekey_to_csl_item as prune.
    - render: when falsy, CSL JSON is written out and the function returns;
      otherwise the references are rendered through Pandoc.
    - output: path-like object to write to, or a falsy value for stdout.
    - format, csl: forwarded to call_pandoc when rendering. A missing format
      is inferred from the output suffix and falls back to 'plain'.

    Citekeys rejected by is_valid_citekey are skipped. Any exception raised
    while processing a citekey is logged and that citekey is skipped, so one
    bad citekey does not abort the whole command.

    Does not allow user to directly specify Pandoc's --to argument, due to
    inconsistent citation rendering by output format. See
    https://github.com/jgm/pandoc/issues/4834
    """
    # generate CSL JSON data
    csl_list = list()
    for citekey in args.citekeys:
        try:
            if not is_valid_citekey(citekey):
                continue
            citekey = standardize_citekey(citekey)
            csl_item = citekey_to_csl_item(citekey, prune=args.prune_csl)
            csl_list.append(csl_item)
        except Exception as error:
            logging.error(f'citekey_to_csl_item for {citekey!r} failed '
                          f'due to a {error.__class__.__name__}:\n{error}')
            logging.info(error, exc_info=True)

    # output CSL JSON data, if --render is False
    if not args.render:
        csl_json = json.dumps(csl_list, ensure_ascii=False, indent=2) + '\n'
        if args.output:
            with args.output.open('w', encoding='utf-8') as write_file:
                write_file.write(csl_json)
        else:
            # Write without a context manager: sys.stdout is not ours to
            # close, and closing it would break any later output.
            sys.stdout.write(csl_json)
        return

    # use Pandoc to render references
    if not args.format and args.output:
        vars(args)['format'] = extension_to_format.get(args.output.suffix)
    if not args.format:
        vars(args)['format'] = 'plain'
    pandoc_metadata = {
        'nocite': '@*',
        'csl': args.csl,
        'references': csl_list,
    }
    call_pandoc(
        metadata=pandoc_metadata,
        path=args.output,
        format=args.format,
    )
Beispiel #7
0
def process_record(record):
    """
    Expand a catalog record with retrieved metadata.

    The record dict is consumed in-place: 'html_url' and 'repo_url' are
    required and removed; 'thumbnail_url', 'preprint_citation' and
    'journal_citation' are optional and removed when present. Whatever is
    left over is returned under the 'extras' key.

    The returned dict has a 'manubot' entry (citing the manuscript by its
    URL) plus a 'preprint' and/or 'journal' entry for each corresponding
    citation that is non-empty and accepted by is_valid_citekey. Every such
    entry is enriched with title, authors, journal, dates and the CSL Item
    retrieved for its citation.
    """
    page_url = record.pop('html_url')
    manubot_entry = {
        'repo_url': record.pop('repo_url'),
        'url': page_url,
        'citation': f"url:{page_url}",
    }
    output = {'manubot': manubot_entry}

    # Prefer an explicitly supplied thumbnail; otherwise look one up from the page.
    thumbnail_url = (
        record.pop('thumbnail_url')
        if 'thumbnail_url' in record
        else get_thumbnail_url_from_html(page_url)
    )
    if thumbnail_url:
        manubot_entry['thumbnail_url'] = thumbnail_url

    for publication_type in ('preprint', 'journal'):
        citation = record.pop(f'{publication_type}_citation', None)
        if citation and is_valid_citekey(citation):
            output[publication_type] = {'citation': citation}

    # Fields derived from the CSL Item, in the order they are added to each entry.
    derived_fields = (
        ('title', get_title),
        ('authors', get_authors_text),
        ('journal', get_journal),
        ('date_iso', get_date),
        ('date_human', get_date_summary),
    )
    for entry in output.values():
        csl_item = citekey_to_csl_item(standardize_citekey(entry['citation']))
        if 'url' not in entry and 'URL' in csl_item:
            entry['url'] = csl_item['URL']
        for field, extract in derived_fields:
            entry[field] = extract(csl_item)
        entry['csl_item'] = csl_item

    # Added last so the enrichment loop above never sees it.
    output['extras'] = record
    return output
Beispiel #8
0
def process_citations(doc):
    """
    Apply citation-by-identifier to a Python object representation of
    Pandoc's Abstract Syntax Tree.

    The following Pandoc metadata fields are considered:

    - bibliography (use to define reference metadata manually)
    - citekey-aliases (use to define tags for cite-by-id citations)
    - manubot-requests-cache-path
    - manubot-clear-requests-cache
    - manubot-output-citekeys: path to write TSV table of citekeys
    - manubot-output-bibliography: path to write generated CSL JSON bibliography

    The "references" metadata field is also read and passed to
    load_manual_references as extra CSL Items.

    Side effects: the document is walked three times, several entries of the
    module-level global_variables dict are written, the citekeys table and
    the CSL JSON bibliography are written via write_citekeys_tsv and
    write_csl_json, and doc.metadata is updated. Nothing is returned.
    """
    citekey_aliases = doc.get_metadata("citekey-aliases", default={})
    # Tolerate malformed metadata: warn and carry on without aliases.
    if not isinstance(citekey_aliases, dict):
        logging.warning(
            f"Expected metadata.citekey-aliases to be a dict, "
            f"but received a {citekey_aliases.__class__.__name__}. Disregarding."
        )
        citekey_aliases = dict()

    # global_variables is shared state; presumably the walk actions below
    # read and update it - verify against their definitions.
    global_variables["citekey_aliases"] = citekey_aliases
    doc.walk(_get_reference_link_citekey_aliases)
    doc.walk(_get_citekeys_action)
    # Expected to have been populated by the walks above - TODO confirm.
    manuscript_citekeys = global_variables["manuscript_citekeys"]
    # Deduplicate, keep only citekeys accepted by is_valid_citekey, and sort.
    manuscript_citekeys = sorted(
        filter(
            lambda x: is_valid_citekey(
                x, allow_tag=True, allow_raw=True, allow_pandoc_xnos=True
            ),
            set(manuscript_citekeys),
        )
    )
    global_variables["manuscript_citekeys"] = manuscript_citekeys
    citekeys_df = get_citekeys_df(
        manuscript_citekeys, global_variables["citekey_aliases"],
    )
    global_variables["citekeys_df"] = citekeys_df
    # Lookup from each manuscript citekey to its shortened citekey.
    global_variables["citekey_shortener"] = dict(
        zip((citekeys_df["manuscript_citekey"]), citekeys_df["short_citekey"])
    )
    # Third walk runs only after citekeys_df and citekey_shortener are stored
    # above; presumably the action depends on them - verify.
    doc.walk(_citation_to_id_action)
    manual_refs = doc.get_metadata("references", default=[])
    bibliography_paths = doc.get_metadata("bibliography", default=[])
    # A single bibliography value is normalized to a one-element list.
    if not isinstance(bibliography_paths, list):
        bibliography_paths = [bibliography_paths]
    manual_refs = load_manual_references(
        bibliography_paths, extra_csl_items=manual_refs
    )
    standard_citekeys = citekeys_df.standard_citekey.unique()
    requests_cache_path = doc.get_metadata("manubot-requests-cache-path")
    # Make sure the directory that will hold the requests cache exists.
    if requests_cache_path:
        pathlib.Path(requests_cache_path).parent.mkdir(parents=True, exist_ok=True)
    csl_items = generate_csl_items(
        citekeys=standard_citekeys,
        manual_refs=manual_refs,
        requests_cache_path=doc.get_metadata("manubot-requests-cache-path"),
        clear_requests_cache=doc.get_metadata("manubot-clear-requests-cache", False),
    )
    write_citekeys_tsv(citekeys_df, path=doc.get_metadata("manubot-output-citekeys"))
    write_csl_json(csl_items, path=doc.get_metadata("manubot-output-bibliography"))
    # Update pandoc metadata with fields that this filter
    # has either consumed, created, or modified.
    doc.metadata["bibliography"] = []
    doc.metadata["references"] = csl_items
    # NOTE(review): this writes the key "citekey_aliases" (underscore) while
    # the value was read from "citekey-aliases" (hyphen) - confirm intended.
    doc.metadata["citekey_aliases"] = citekey_aliases