def update_individual(path, keyword):
    """
    Narrow the metadata at `path` down to one individual manuscript.

    An author is kept when their "manuscripts" dictionary contains `keyword`.
    For every kept author, "contributions" is replaced by the sorted
    manuscript-specific contributions. The "consortiummembers" entry is
    generated from the complete author list before any filtering happens.
    The resulting metadata is written back to `path`.
    """
    metadata = read_serialized_data(path)
    authors = metadata.get("authors", [])
    metadata["consortiummembers"] = generate_consortium_members(authors)

    def manuscript_position(author):
        # An author who has the keyword (and is therefore an author of this
        # manuscript) but no explicit order gets -1, which moves them to the
        # front of the list. The name breaks ties.
        return author["manuscripts"][keyword].get("order", -1), author["name"]

    individual_authors = []
    for author in authors:
        if "manuscripts" not in author:
            continue
        if keyword not in author["manuscripts"]:
            continue
        individual_authors.append(author)
    individual_authors = sorted(individual_authors, key=manuscript_position)

    # Replace the general contributions with the manuscript-specific ones
    for author in individual_authors:
        manuscript_details = author["manuscripts"][keyword]
        contributions = manuscript_details.get("contributions", MISSING_CONTRIBUTIONS)
        if contributions == MISSING_CONTRIBUTIONS:
            sys.stderr.write(f"Missing {keyword} contributions for {author['name']}\n")
        author["contributions"] = sorted(contributions)

    sys.stderr.write(f"Found {len(individual_authors)} authors for {keyword} manuscript\n")
    metadata["authors"] = individual_authors
    dump_yaml(metadata, path)
def load_bibliography(path) -> list:
    """
    Load a bibliography as CSL Items (a CSL JSON Python object).

    For paths that already contain CSL Items (inferred from a .json or .yaml
    extension), parse these files directly with `read_serialized_data`.
    Otherwise, delegate conversion to CSL Items to pandoc-citeproc.

    `path` may be a string or a path-like object. The return value is a list
    of `CSL_Item` objects. The list is empty when parsing a .json/.yaml file
    raises, or when the loaded data is not a list; both cases are logged.
    """
    # The Path object is only used to inspect the suffix and to call the
    # pandoc loader. The serialized-data reader receives the caller's
    # original value: converting a URL such as "https://host/refs.json" to a
    # Path collapses the double slash, which corrupts the location.
    path_obj = pathlib.Path(path)
    if path_obj.suffix in {".json", ".yaml"}:
        try:
            csl_items = read_serialized_data(path)
        except Exception:
            # Best effort: record the traceback and continue with no items.
            logging.exception(f"process.load_bibliography: error parsing {path}.\n")
            csl_items = []
    else:
        # Imported lazily so pandoc support is only needed for this branch.
        from manubot.pandoc.bibliography import (
            load_bibliography as load_bibliography_pandoc,
        )

        csl_items = load_bibliography_pandoc(path_obj)
    if not isinstance(csl_items, list):
        logging.error(
            f"process.load_bibliography: csl_items read from {path} are of type {type(csl_items)}. "
            "Setting csl_items to an empty list."
        )
        csl_items = []
    from manubot.cite.csl_item import CSL_Item

    csl_items = [CSL_Item(csl_item) for csl_item in csl_items]
    return csl_items
def load_bibliography(path: str) -> list:
    """
    Read a bibliography and return its entries as CSL Items.

    Files with a .json or .yaml extension are assumed to hold CSL Items
    already and are parsed directly (URLs supported). Any other extension is
    handed to pandoc-citeproc for conversion (URLs not supported).
    When direct parsing raises, or the loaded data is not a list, an error
    is logged and an empty list is returned.
    """
    if pathlib.Path(path).suffix not in {".json", ".yaml"}:
        from manubot.pandoc.bibliography import (
            load_bibliography as load_bibliography_pandoc,
        )

        csl_items = load_bibliography_pandoc(path)
    else:
        try:
            csl_items = read_serialized_data(path)
        except Exception as error:
            logging.error(f"load_bibliography: error reading {path!r}.\n{error}")
            # The traceback is only emitted at INFO level.
            logging.info("load_bibliography exception info", exc_info=True)
            csl_items = []

    if not isinstance(csl_items, list):
        logging.error(
            f"process.load_bibliography: csl_items read from {path} are of type {type(csl_items)}. "
            "Setting csl_items to an empty list."
        )
        csl_items = []

    from manubot.cite.csl_item import CSL_Item

    return [CSL_Item(entry) for entry in csl_items]
def read_variable_files(paths: List[str], variables: Optional[dict] = None) -> dict:
    """
    Read multiple serialized data files into a user_variables dictionary.

    Provide `paths` (a list of URLs or local file paths).
    Paths can optionally have a namespace prepended.
    For example:

    ```python
    paths = [
        'https://git.io/vbkqm',  # update the dictionary's top-level
        'namespace_1=https://git.io/vbkqm',  # store under 'namespace_1' key
        'namespace_2=some_local_path.json',  # store under 'namespace_2' key
    ]
    ```

    If a namespace is not provided, the JSON must contain a dictionary as its
    top level. Namespaces should consist only of ASCII alphanumeric characters
    (includes underscores, first character cannot be numeric).

    Pass a dictionary to `variables` to update an existing dictionary rather
    than create a new dictionary.

    A path that cannot be read is logged and skipped. Keys that overwrite
    existing values are reported with a warning. Returns the updated
    dictionary (the same object as `variables` when one was passed).
    """
    if variables is None:
        variables = {}
    for path in paths:
        logging.info(f"Reading user-provided templating variables at {path!r}")
        # Match only namespaces that are valid jinja2 variable names
        # http://jinja.pocoo.org/docs/2.10/api/#identifier-naming
        match = re.match(r"([a-zA-Z_][a-zA-Z0-9_]*)=(.+)", path)
        if match:
            namespace, path = match.groups()
            logging.info(
                f"Using the {namespace!r} namespace for template variables from {path!r}"
            )
        try:
            if match:
                obj = {namespace: read_serialized_data(path)}
            else:
                obj = read_serialized_dict(path)
        except Exception:
            # Best effort: a single unreadable file must not abort the rest.
            logging.exception(f"Error reading template variables from {path!r}")
            continue
        assert isinstance(obj, dict)
        conflicts = variables.keys() & obj.keys()
        if conflicts:
            # Keys are stringified and sorted so the message is deterministic
            # and non-string keys cannot break the join.
            logging.warning(
                f"Template variables in {path!r} overwrite existing "
                "values for the following keys:\n"
                + "\n".join(sorted(map(str, conflicts)))
            )
        variables.update(obj)
    # Only serialize for the debug message when it will actually be emitted,
    # and never let a value JSON cannot represent (e.g. a date) abort the
    # function: fall back to repr for the log text.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        try:
            dumped = json.dumps(variables, indent=2, ensure_ascii=False)
        except (TypeError, ValueError):
            dumped = repr(variables)
        logging.debug(
            f"Reading user-provided templating variables complete:\n{dumped}"
        )
    return variables
def get_jsonschema_csl_validator():
    """
    Build a jsonschema validator for the CSL Item JSON Schema.

    The schema is read from the URL below, the validator class matching the
    schema is selected, the schema itself is checked, and an instance of that
    class bound to the schema is returned.
    """
    import jsonschema

    schema_url = "https://github.com/dhimmel/csl-schema/raw/manubot/csl-data.json"
    csl_schema = read_serialized_data(schema_url)
    validator_class = jsonschema.validators.validator_for(csl_schema)
    validator_class.check_schema(csl_schema)
    return validator_class(csl_schema)
def update_merged(path):
    """
    Rewrite author contributions in the metadata at `path` for the merged manuscript.

    For an author with a "manuscripts" dictionary, the contributions become
    the sorted union of the contributions listed for each individual
    manuscript, replacing any general contributions. Otherwise the author's
    existing "contributions" are used, or MISSING_CONTRIBUTIONS when there
    are none. Authors who have not confirmed the code of conduct are reported
    on stderr. The "consortiummembers" entry is generated from the author
    list, and the metadata is written back to `path`.
    """
    metadata = read_serialized_data(path)
    authors = metadata.get("authors", [])
    metadata["consortiummembers"] = generate_consortium_members(authors)

    for author in authors:
        if "manuscripts" in author:
            # Union over every individual manuscript; a manuscript without
            # contributions adds the MISSING_CONTRIBUTIONS placeholder.
            contributions = set()
            for manuscript_details in author["manuscripts"].values():
                contributions.update(
                    manuscript_details.get("contributions", MISSING_CONTRIBUTIONS)
                )
        elif "contributions" in author:
            contributions = set(author["contributions"])
        else:
            contributions = set(MISSING_CONTRIBUTIONS)

        if MISSING_CONTRIBUTIONS[0] in contributions:
            sys.stderr.write(f"Missing contributions for {author['name']}\n")
        author["contributions"] = sorted(contributions)

        # The code of conduct counts as approved only when the entry exists,
        # has a "confirmed" key, and that value is truthy.
        approved = (
            "code of conduct" in author
            and "confirmed" in author["code of conduct"]
            and author["code of conduct"]["confirmed"]
        )
        if not approved:
            sys.stderr.write(f"{author['name']} has not approved the code of conduct\n")

    sys.stderr.write(
        f"Updating contributions for {len(authors)} authors for merged manuscript\n"
    )
    metadata["authors"] = authors
    dump_yaml(metadata, path)
# NOTE(review): this `with` statement looks like the tail of a YAML-dumping
# helper whose `def` line is not visible in this chunk; `path` and `obj` are
# presumably that helper's parameters — confirm against the full file.
with path.open("w", encoding="utf-8") as write_file:
    yaml.dump(
        obj,
        write_file,
        # default_flow_style=False,
        explicit_start=True,
        explicit_end=True,
        width=float("inf"),
        sort_keys=False,
        allow_unicode=True,
    )
    # Finish the file with a newline after the dumped document.
    write_file.write("\n")


# Script entry point: optionally shuffle metadata.authors and write the
# metadata back to the same path.
if __name__ == "__main__":
    """
    Alternative to https://github.com/manubot/manubot/pull/214
    """
    args = parse_args()
    # Do nothing unless execution was requested.
    if not args.execute:
        sys.stderr.write(
            "Exiting without doing anything due to --only-on-ci\n")
        sys.exit()
    metadata = read_serialized_data(args.path)
    authors = metadata.get("authors", [])
    if args.shuffle:
        sys.stderr.write("Shuffling metadata.authors\n")
        # The shuffle is seeded with the value returned by get_head_commit();
        # presumably the current HEAD commit, so the order is reproducible
        # per commit — confirm.
        seed = get_head_commit()
        shuffle(authors, seed=seed)
    dump_yaml(metadata, args.path)
def update_latex(keyword, manubot_file, pandoc_file):
    """
    Update Manubot author metadata for an individual manuscript in preparation for LaTeX conversion.

    Prepares a metadata file to override and supplement the metadata in the
    YAML block in the Markdown file Pandoc processes.

    Reads the metadata from `manubot_file`, keeps the authors whose
    "manuscripts" dictionary contains `keyword`, and writes a new metadata
    dictionary to `pandoc_file`. That dictionary holds the reduced author
    entries under "author", the bibliography file name under "bibfile", the
    entries of ACM_DISCO_2021, and, when present, "conflicts" and "funding"
    statements.
    """
    metadata = read_serialized_data(manubot_file)
    authors = metadata.get("authors", [])
    individual_authors = [
        author
        for author in authors
        if "manuscripts" in author and keyword in author["manuscripts"]
    ]
    # Sort authors by their numeric order for this individual manuscript
    # If the author has the manuscript keyword, which indicates authorship, but not an order
    # the default order is -1, which should move them to the front of the author list
    # Sort by name to break ties
    individual_authors.sort(
        key=lambda author: (
            author["manuscripts"][keyword].get("order", -1),
            author["name"],
        )
    )

    # Set affiliation fields to the manuscript-specific affiliation formatting expected by the LaTeX template
    # and discard fields that will not be used by the LaTeX template.
    # The old affiliations are deliberately not kept.
    # An ordered tuple is used instead of a set: iteration order of a set of
    # strings changes between interpreter runs, which would make the key
    # order of each author entry unstable wherever insertion order is kept.
    keep_fields = ("name", "email", "orcid")
    latex_authors = []
    conflicts = []
    funding = []
    for author in individual_authors:
        latex_author = {
            field: author[field] for field in keep_fields if field in author
        }

        # A list of the author's affiliations formatted for the template
        # The first affiliation is stored in the "affiliations" field
        # Any additional affiliations are stored in the "additionalaffiliations" field
        affiliations = author["manuscripts"][keyword].get(
            "affiliations", MISSING_AFFILIATIONS
        )
        if affiliations == MISSING_AFFILIATIONS:
            sys.stderr.write(f"Missing {keyword} affiliations for {author['name']}\n")
        if affiliations:
            latex_author["affiliations"] = affiliations[0]
        if len(affiliations) > 1:
            latex_author["additionalaffiliations"] = affiliations[1:]
        latex_authors.append(latex_author)

        # Check whether the author has declared conflicts of interest.
        # A declaration equal to the string "None" is not listed.
        if "coi" in author:
            conflict = author["coi"].get("string", MISSING_COI)
            if conflict != "None":
                conflicts.append(f"{author['name']}: {conflict}.")

        # Check whether the author has funding
        # This text will not be used directly but will help write a funding statement manually
        if "funders" in author:
            # Less robust handling of funders field than Manubot
            # https://github.com/manubot/manubot/blob/3ff3000f76dcf82a30694d076a4da95326e3f6ae/manubot/process/util.py#L78
            funders = author["funders"]
            if isinstance(funders, list):
                funders = "; ".join(funders)
            # Assumes initials are always provided
            funding.append(f"{author['initials']}: {funders}.")

    sys.stderr.write(f"Found {len(latex_authors)} authors for {keyword} manuscript\n")

    # Do not retain the other metadata fields and add the .bib file references
    metadata = {"author": latex_authors, "bibfile": keyword + ".bib"}
    metadata.update(ACM_DISCO_2021)

    # Add conflicts if any exist
    if conflicts:
        metadata["conflicts"] = "Conflicts of interest. " + " ".join(conflicts)

    # Add funding comment if funders were listed
    if funding:
        metadata["funding"] = "Author funding. " + " ".join(funding)

    dump_yaml(metadata, pandoc_file)