def update_execution_cache(
    app: Sphinx,
    builder: Builder,
    added: Set[str],
    changed: Set[str],
    removed: Set[str],
):
    """If caching is required, stage and execute the added or modified notebooks,
    and cache them for later retrieval.

    This is called by sphinx as an `env-get-outdated` event,
    which is emitted when the environment determines which source files
    have changed and should be re-read.
    """
    # all the added and changed notebooks should be operated on.
    # note docnames are paths relative to the sphinx root folder, with no extensions
    altered_docnames = added.union(changed)

    exec_docnames = [
        docname for docname in altered_docnames if is_valid_exec_file(app.env, docname)
    ]
    LOGGER.verbose("MyST-NB: Potential docnames to execute: %s", exec_docnames)

    if app.config["jupyter_execute_notebooks"] == "cache":
        app.env.nb_path_to_cache = str(
            app.config["jupyter_cache"]
            or Path(app.outdir).parent.joinpath(".jupyter_cache")
        )

        cache_base = get_cache(app.env.nb_path_to_cache)
        for path in removed:
            if path in app.env.nb_execution_data:
                app.env.nb_execution_data_changed = True
                app.env.nb_execution_data.pop(path, None)
            docpath = app.env.doc2path(path)
            # there is an issue in sphinx doc2path, whereby if the path does not
            # exist then it will be assigned the default source_suffix (usually .rst)
            # therefore, to be safe here, we run through all possible suffixes
            for suffix in app.env.nb_allowed_exec_suffixes:
                docpath = os.path.splitext(docpath)[0] + suffix
                if not os.path.exists(docpath):
                    cache_base.discard_staged_notebook(docpath)

        _stage_and_execute(
            env=app.env,
            exec_docnames=exec_docnames,
            path_to_cache=app.env.nb_path_to_cache,
            timeout=app.config["execution_timeout"],
            allow_errors=app.config["execution_allow_errors"],
            exec_in_temp=app.config["execution_in_temp"],
        )

    return []

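# A minimal sketch of how this handler might be registered, given that the
# docstring above says it is called for the `env-get-outdated` event. The
# setup() body here is an assumption for illustration, not the extension's
# actual registration code.
from sphinx.application import Sphinx


def setup(app: Sphinx):
    app.connect("env-get-outdated", update_execution_cache)
    return {"parallel_read_safe": True}
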
def cat_artifact(cache_path, pk, artifact_rpath):
    """Print the contents of a cached artefact."""
    db = get_cache(cache_path)
    with db.cache_artefacts_temppath(pk) as path:
        artifact_path = path.joinpath(artifact_rpath)
        if not artifact_path.exists():
            click.secho("Artifact does not exist", fg="red")
            sys.exit(1)
        if not artifact_path.is_file():
            click.secho("Artifact is not a file", fg="red")
            sys.exit(1)
        text = artifact_path.read_text()
    click.echo(text)

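# The CLI functions in this file are shown without their command decorators.
# A hedged sketch of how cat_artifact might be exposed using plain click
# primitives (the real project may use shared option/argument helpers instead):
import click


@click.command("cat-artifact")
@click.argument("pk", type=int)
@click.argument("artifact_rpath")
@click.option("--cache-path", default=".jupyter_cache", show_default=True)
def cat_artifact_cmd(cache_path, pk, artifact_rpath):
    cat_artifact(cache_path, pk, artifact_rpath)
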
def execute_nbs(cache_path, entry_point, pks, timeout):
    """Execute staged notebooks that are outdated."""
    import yaml

    from jupyter_cache.executors import load_executor

    db = get_cache(cache_path)
    try:
        # use the executor named by the caller, rather than hard-coding "basic"
        # (the entry_point parameter was previously unused)
        executor = load_executor(entry_point, db, logger=logger)
    except ImportError as error:
        logger.error(str(error))
        return 1
    result = executor.run_and_cache(filter_pks=pks or None, timeout=timeout)
    click.secho(
        "Finished! Successfully executed notebooks have been cached.", fg="green"
    )
    click.echo(yaml.safe_dump(result, sort_keys=False))

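# The same execution flow driven programmatically rather than via the CLI.
# A minimal sketch, assuming a project cache at .jupyter_cache with notebooks
# already staged; "basic" is the default executor entry point used elsewhere
# in this file.
from jupyter_cache import get_cache
from jupyter_cache.executors import load_executor

db = get_cache(".jupyter_cache")
executor = load_executor("basic", db)
result = executor.run_and_cache(timeout=30)  # summary of executed/errored notebooks
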
def remove_caches(cache_path, pks, remove_all):
    """Remove notebooks stored in the cache."""
    from jupyter_cache.base import CachingError

    db = get_cache(cache_path)
    if remove_all:
        pks = [r.pk for r in db.list_cache_records()]
    for pk in pks:
        # TODO deal with errors (print all at end? or option to ignore)
        click.echo("Removing Cache ID = {}".format(pk))
        try:
            db.remove_cache(pk)
        except KeyError:
            click.secho("Does not exist", fg="red")
        except CachingError as err:
            click.secho("Error: ", fg="red")
            click.echo(str(err))
    click.secho("Success!", fg="green")

def _stage_and_execute(
    env: BuildEnvironment,
    exec_docnames: List[str],
    path_to_cache: str,
    timeout: Optional[int],
    allow_errors: bool,
    exec_in_temp: bool,
):
    pk_list = []
    cache_base = get_cache(path_to_cache)

    for nb in exec_docnames:
        source_path = env.doc2path(nb)
        with open(source_path, encoding="utf8") as handle:
            # here we pass an iterator, so that only the required lines are read
            converter = get_nb_converter(source_path, env, (line for line in handle))
        if converter is not None:
            stage_record = cache_base.stage_notebook_file(source_path)
            pk_list.append(stage_record.pk)

    # can leverage parallel execution implemented in jupyter-cache here
    try:
        with progress_message("executing outdated notebooks"):
            execute_staged_nb(
                cache_base,
                pk_list or None,
                timeout=timeout,
                exec_in_temp=exec_in_temp,
                allow_errors=allow_errors,
                env=env,
            )
    except OSError as err:
        # This is a 'fix' for obscure cases, such as if you
        # remove name.ipynb and add name.md (i.e. same name, different extension)
        # and then name.ipynb isn't flagged for removal.
        # Normally we want to keep the stage records available, so that we can
        # retrieve execution tracebacks at the `generate_notebook_outputs` stage,
        # but we need to flush if it becomes 'corrupted'
        LOGGER.error(
            "Execution failed in an unexpected way, clearing staged notebooks: %s",
            err,
        )
        for record in cache_base.list_staged_records():
            cache_base.discard_staged_notebook(record.pk)

def list_caches(cache_path, latest_only, hashkeys, path_length):
    """List cached notebook records in the cache."""
    db = get_cache(cache_path)
    records = db.list_cache_records()
    if not records:
        click.secho("No Cached Notebooks", fg="blue")
    # TODO optionally list number of artifacts
    if latest_only:
        # keep only the most recently created record per URI
        latest_records = {}
        for record in records:
            if record.uri not in latest_records:
                latest_records[record.uri] = record
                continue
            if latest_records[record.uri].created < record.created:
                latest_records[record.uri] = record
        records = list(latest_records.values())
    click.echo(
        tabulate_cache_records(records, hashkeys=hashkeys, path_length=path_length)
    )

def show_cache(cache_path, pk):
    """Show details of a cached notebook in the cache."""
    import yaml

    db = get_cache(cache_path)
    try:
        record = db.get_cache_record(pk)
    except KeyError:
        click.secho("ID {} does not exist, Aborting!".format(pk), fg="red")
        sys.exit(1)
    data = record.format_dict(hashkey=True, path_length=None)
    click.echo(yaml.safe_dump(data, sort_keys=False), nl=False)
    with db.cache_artefacts_temppath(pk) as folder:
        paths = [
            str(p.relative_to(folder)) for p in folder.glob("**/*") if p.is_file()
        ]
    if not paths:
        click.echo("")
        return
    click.echo("Artifacts:")
    for path in paths:
        click.echo(f"- {path}")

def show_staged(cache_path, pk, tb):
    """Show details of a staged notebook."""
    import yaml

    db = get_cache(cache_path)
    try:
        record = db.get_staged_record(pk)
    except KeyError:
        click.secho("ID {} does not exist, Aborting!".format(pk), fg="red")
        sys.exit(1)
    cache_record = db.get_cache_record_of_staged(record.uri)
    data = record.format_dict(cache_record=cache_record, path_length=None, assets=False)
    click.echo(yaml.safe_dump(data, sort_keys=False).rstrip())
    if record.assets:
        click.echo("Assets:")
        for path in record.assets:
            click.echo(f"- {path}")
    if record.traceback:
        click.secho("Failed Last Execution!", fg="red")
        if tb:
            click.echo(record.traceback)

def stage_nb(cache_path, nbpath, asset_paths):
    """Stage a notebook, with possible asset files."""
    db = get_cache(cache_path)
    db.stage_notebook_file(nbpath, asset_paths)
    click.secho("Success!", fg="green")

def diff_nb(cache_path, pk, nbpath):
    """Print a diff of a notebook to one stored in the cache."""
    db = get_cache(cache_path)
    click.echo(db.diff_nbfile_with_cache(pk, nbpath, as_str=True))
    click.secho("Success!", fg="green")

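# Staging and diffing driven programmatically, using the same API calls as the
# two commands above. A minimal sketch, assuming notebook.ipynb exists and has
# already been cached once under pk=1:
from jupyter_cache import get_cache

db = get_cache(".jupyter_cache")
record = db.stage_notebook_file("notebook.ipynb")  # returns a record with a .pk
print(db.diff_nbfile_with_cache(1, "notebook.ipynb", as_str=True))
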
def notebook_execute(options, status):

    # if this is a re-execution of a previously loaded kernel,
    # make sure the underlying python version hasn't changed
    python_cmd = options.get("python_cmd", None)
    if python_cmd:
        if hasattr(notebook_execute, "python_cmd"):
            if notebook_execute.python_cmd != python_cmd:
                raise RestartKernel
        else:
            notebook_execute.python_cmd = python_cmd

    # unpack options
    input = options["target"]["input"]
    format = options["format"]
    resource_dir = options["resourceDir"]
    params = options.get("params", None)
    run_path = options.get("cwd", "")
    quiet = options.get("quiet", False)

    # change working directory and strip dir off of paths
    original_input = input
    os.chdir(Path(input).parent)
    input = Path(input).name

    # read variables out of format
    execute = format["execute"]
    eval = execute["eval"]
    allow_errors = bool(execute["error"])
    fig_width = execute["fig-width"]
    fig_height = execute["fig-height"]
    fig_format = execute["fig-format"]
    fig_dpi = execute["fig-dpi"]
    if "cache" in execute:
        cache = execute["cache"]
    else:
        cache = "user"

    # set environment variables
    os.environ["QUARTO_FIG_WIDTH"] = str(fig_width)
    os.environ["QUARTO_FIG_HEIGHT"] = str(fig_height)
    if fig_format == "retina":
        os.environ["QUARTO_FIG_DPI"] = str(fig_dpi * 2)
        os.environ["QUARTO_FIG_FORMAT"] = "png"
    else:
        os.environ["QUARTO_FIG_DPI"] = str(fig_dpi)
        os.environ["QUARTO_FIG_FORMAT"] = fig_format

    # read the notebook
    nb = nbformat.read(input, as_version=NB_FORMAT_VERSION)

    # inject parameters if provided
    if params:
        nb_parameterize(nb, params)

    # insert setup cell
    setup_cell = nb_setup_cell(
        nb.metadata.kernelspec,
        resource_dir,
        fig_width,
        fig_height,
        fig_format,
        fig_dpi,
        run_path,
    )
    nb.cells.insert(0, setup_cell)

    # are we using the cache? if so connect to the cache, and then, if we aren't
    # in 'refresh' (forced re-execution) mode, try to satisfy the execution
    # request from the cache
    if cache is True or cache == "refresh":
        if not get_cache:
            raise ImportError(
                "The jupyter-cache package is required for cached execution"
            )
        nb_cache = get_cache(".jupyter_cache")
        if not cache == "refresh":
            cached_nb = nb_from_cache(nb, nb_cache)
            if cached_nb:
                cached_nb.cells.pop(0)
                nb_write(cached_nb, input)
                status("(Notebook read from cache)\n\n")
                return True  # can persist kernel
    else:
        nb_cache = None

    # create resources for execution
    resources = dict({"metadata": {"input": original_input}})
    if run_path:
        resources["metadata"]["path"] = run_path

    # create NotebookClient
    client, created = notebook_init(nb, resources, allow_errors)

    # complete progress if necessary
    if (not quiet) and created:
        status("Done\n")

    # compute total code cells (for progress)
    current_code_cell = 1
    total_code_cells = sum(cell.cell_type == "code" for cell in client.nb.cells)

    # execute the cells
    for index, cell in enumerate(client.nb.cells):

        # progress
        progress = (not quiet) and cell.cell_type == "code" and index > 0
        if progress:
            status(
                "  Cell {0}/{1}...".format(current_code_cell - 1, total_code_cells - 1)
            )

        # clear cell output
        cell = cell_clear_output(cell)

        # execute cell
        if cell.cell_type == "code":
            cell = cell_execute(
                client,
                cell,
                index,
                current_code_cell,
                eval,
                index > 0,  # add_to_history
            )
            cell.execution_count = current_code_cell

        # if this was the setup cell, see if we need to exit b/c dependencies
        # are out of date
        if index == 0:
            kernel_deps = nb_kernel_depenencies(cell)
            if kernel_deps:
                if hasattr(notebook_execute, "kernel_deps"):
                    for path in kernel_deps.keys():
                        if path in notebook_execute.kernel_deps.keys():
                            if notebook_execute.kernel_deps[path] != kernel_deps[path]:
                                raise RestartKernel
                        else:
                            notebook_execute.kernel_deps[path] = kernel_deps[path]
                else:
                    notebook_execute.kernel_deps = kernel_deps
            else:
                notebook_execute.kernel_deps = {}

            # we are done w/ setup (with no restarts) so it's safe to print 'Executing...'
            if not quiet:
                status("\nExecuting '{0}'\n".format(input))

        # assign cell
        client.nb.cells[index] = cell

        # increment current code cell
        if cell.cell_type == "code":
            current_code_cell += 1

        # end progress
        if progress:
            status("Done\n")

    # set widgets metadata
    client.set_widgets_metadata()

    # write to the cache
    if nb_cache:
        nb_write(client.nb, input)
        nb_cache.cache_notebook_file(path=Path(input), overwrite=True)

    # remove setup cell (then renumber execution_count)
    client.nb.cells.pop(0)
    for index, cell in enumerate(client.nb.cells):
        if cell.cell_type == "code":
            cell.execution_count = cell.execution_count - 1

    # re-write without setup cell
    nb_write(client.nb, input)

    # execute cleanup cell
    cleanup_cell = nb_cleanup_cell(nb.metadata.kernelspec, resource_dir)
    nb.cells.append(cleanup_cell)
    client.execute_cell(
        cell=cleanup_cell, cell_index=len(client.nb.cells) - 1, store_history=False
    )
    nb.cells.pop()

    # progress
    if not quiet:
        status("\n")

    # return flag indicating whether we should persist
    persist = notebook_execute.kernel_deps is not None
    return persist

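# The shape of the options mapping that notebook_execute unpacks above,
# reconstructed from the reads it performs. The values here are illustrative
# assumptions, not defaults taken from the real caller:
options = {
    "target": {"input": "/project/report.ipynb"},
    "format": {
        "execute": {
            "eval": True,
            "error": False,
            "fig-width": 7,
            "fig-height": 5,
            "fig-format": "png",
            "fig-dpi": 96,
            "cache": "user",  # or True / "refresh"; defaults to "user" if absent
        }
    },
    "resourceDir": "/opt/quarto/share",
    "params": None,  # optional notebook parameters
    "cwd": "",       # optional run path
    "quiet": False,
    # "python_cmd": [...]  # optional; a change triggers RestartKernel
}
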
def cache_nb(cache_path, artifact_paths, nbpath, validate, overwrite):
    """Cache a notebook, with possible artefact files."""
    db = get_cache(cache_path)
    success = cache_file(db, nbpath, validate, overwrite, artifact_paths)
    if success:
        click.secho("Success!", fg="green")

def generate_notebook_outputs(
    env: BuildEnvironment,
    ntbk: nbf.NotebookNode,
    file_path: Optional[str] = None,
    show_traceback: bool = False,
) -> nbf.NotebookNode:
    """Add outputs to a NotebookNode by pulling from cache.

    Get the cached output of the notebook and merge it with the original
    notebook. If there is no cached output, check whether there was an error
    during execution, then save the traceback to a log file.
    """
    # check if the file is of a format that may be associated with outputs
    if not is_valid_exec_file(env, env.docname):
        return ntbk

    # If we have a jupyter_cache, see if there's a cache for this notebook
    file_path = file_path or env.doc2path(env.docname)

    execution_method = env.config["jupyter_execute_notebooks"]  # type: str

    path_to_cache = env.nb_path_to_cache if "cache" in execution_method else None

    if not path_to_cache and "off" in execution_method:
        return ntbk

    if not path_to_cache:

        if execution_method == "auto" and nb_has_all_output(file_path):
            LOGGER.info(
                "Did not execute %s. "
                "Set jupyter_execute_notebooks to `force` to execute",
                env.docname,
            )
        else:
            if env.config["execution_in_temp"]:
                with tempfile.TemporaryDirectory() as tmpdirname:
                    LOGGER.info("Executing: %s in temporary directory", env.docname)
                    result = single_nb_execution(
                        ntbk,
                        cwd=tmpdirname,
                        timeout=env.config["execution_timeout"],
                        allow_errors=env.config["execution_allow_errors"],
                    )
            else:
                cwd = Path(file_path).parent
                LOGGER.info("Executing: %s in: %s", env.docname, cwd)
                result = single_nb_execution(
                    ntbk,
                    cwd=cwd,
                    timeout=env.config["execution_timeout"],
                    allow_errors=env.config["execution_allow_errors"],
                )

            report_path = None
            if result.err:
                report_path, message = _report_exec_fail(
                    env,
                    Path(file_path).name,
                    result.exc_string,
                    show_traceback,
                    "Execution Failed with traceback saved in {}",
                )
                LOGGER.error(message)

            ntbk = result.nb

            env.nb_execution_data_changed = True
            env.nb_execution_data[env.docname] = {
                "mtime": datetime.now().timestamp(),
                "runtime": result.time,
                "method": execution_method,
                "succeeded": not result.err,
            }
            if report_path:
                env.nb_execution_data[env.docname]["error_log"] = report_path

        return ntbk

    cache_base = get_cache(path_to_cache)
    # Use relpath here in case Sphinx is building from a non-parent folder
    r_file_path = Path(os.path.relpath(file_path, Path().resolve()))

    # default execution data
    runtime = None
    succeeded = False
    report_path = None

    try:
        pk, ntbk = cache_base.merge_match_into_notebook(ntbk)
    except KeyError:
        message = (
            f"Couldn't find cache key for notebook file {str(r_file_path)}. "
            "Outputs will not be inserted."
        )
        try:
            stage_record = cache_base.get_staged_record(file_path)
        except KeyError:
            stage_record = None
        if stage_record and stage_record.traceback:
            report_path, suffix = _report_exec_fail(
                env,
                r_file_path.name,
                stage_record.traceback,
                show_traceback,
                "\n  Last execution failed with traceback saved in {}",
            )
            message += suffix
        LOGGER.error(message)
    else:
        LOGGER.verbose("Merged cached outputs into %s", str(r_file_path))
        succeeded = True
        try:
            runtime = cache_base.get_cache_record(pk).data.get(
                "execution_seconds", None
            )
        except Exception:
            pass

    env.nb_execution_data_changed = True
    env.nb_execution_data[env.docname] = {
        "mtime": datetime.now().timestamp(),
        "runtime": runtime,
        "method": execution_method,
        "succeeded": succeeded,
    }
    if report_path:
        env.nb_execution_data[env.docname]["error_log"] = report_path

    return ntbk

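# _report_exec_fail is not shown in this file. Based on the inline
# report-writing logic in add_notebook_outputs below, it plausibly writes the
# traceback to <outdir>/reports/<name>.log and returns the log path plus a
# formatted message. A hedged sketch only; the real helper may differ:
def _report_exec_fail(env, file_name, traceback, show_traceback, template):
    reports_dir = str(Path(env.app.outdir) / "reports")
    ensuredir(reports_dir)
    full_path = os.path.join(reports_dir, os.path.splitext(file_name)[0] + ".log")
    with open(full_path, "w", encoding="utf8") as log_file:
        log_file.write(traceback)
    message = template.format(full_path)
    if show_traceback:
        message += "\n" + traceback
    return full_path, message
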
def start_client(self):
    # setup the cache
    cache = get_cache(self.nb_config.execution_cache_path or ".jupyter_cache")
    # TODO config on what notebook/cell metadata to hash/merge

    # attempt to match the notebook to one in the cache
    cache_record = None
    with suppress(KeyError):
        cache_record = cache.match_cache_notebook(self.notebook)

    # use the cached notebook if it exists
    if cache_record is not None:
        self.logger.info(f"Using cached notebook: ID={cache_record.pk}")
        _, self._notebook = cache.merge_match_into_notebook(self.notebook)
        self.exec_metadata = {
            "mtime": cache_record.created.timestamp(),
            "runtime": cache_record.data.get("execution_seconds", None),
            "method": self.nb_config.execution_mode,
            "succeeded": True,
            "error": None,
            "traceback": None,
        }
        return

    if self.path is None:
        raise ValueError(
            "Input source must exist as file, if execution_mode is 'cache'"
        )

    # attempt to execute the notebook
    read_fmt = self._kwargs.get("read_fmt", None)
    if read_fmt is not None:
        stage_record = cache.add_nb_to_project(str(self.path), read_data=read_fmt)
    else:
        stage_record = cache.add_nb_to_project(str(self.path))
    # TODO do in try/except, in case of db write errors
    NbProjectRecord.remove_tracebacks([stage_record.pk], cache.db)
    cwd_context: ContextManager[str] = (
        TemporaryDirectory()  # type: ignore
        if self.nb_config.execution_in_temp
        else nullcontext(str(self.path.parent))
    )
    with cwd_context as cwd:
        cwd = os.path.abspath(cwd)
        self.logger.info(
            "Executing notebook using "
            + ("temporary" if self.nb_config.execution_in_temp else "local")
            + " CWD"
        )
        result = single_nb_execution(
            self.notebook,
            cwd=cwd,
            allow_errors=self.nb_config.execution_allow_errors,
            timeout=self.nb_config.execution_timeout,
            meta_override=True,  # TODO still support this?
        )

    # handle success / failure cases
    # TODO do in try/except to be careful (in case of database write errors?)
    if result.err is not None:
        if self.nb_config.execution_raise_on_error:
            raise ExecutionError(str(self.path)) from result.err
        msg = f"Executing notebook failed: {result.err.__class__.__name__}"
        if self.nb_config.execution_show_tb:
            msg += f"\n{result.exc_string}"
        self.logger.warning(msg, subtype="exec")
        NbProjectRecord.set_traceback(stage_record.uri, result.exc_string, cache.db)
    else:
        self.logger.info(f"Executed notebook in {result.time:.2f} seconds")
        cache_record = cache.cache_notebook_bundle(
            CacheBundleIn(
                self.notebook,
                stage_record.uri,
                data={"execution_seconds": result.time},
            ),
            check_validity=False,
            overwrite=True,
        )
        self.logger.info(f"Cached executed notebook: ID={cache_record.pk}")
    self.exec_metadata = {
        "mtime": datetime.now().timestamp(),
        "runtime": result.time,
        "method": self.nb_config.execution_mode,
        "succeeded": not result.err,
        "error": f"{result.err.__class__.__name__}" if result.err else None,
        "traceback": result.exc_string if result.err else None,
    }

def add_notebook_outputs(env, ntbk, file_path=None, show_traceback=False):
    """Add outputs to a NotebookNode by pulling from cache.

    Get the cached output of the notebook and merge it with the original
    notebook. If there is no cached output, check whether there was an error
    during execution, then save the traceback to a log file.
    """
    # If we have a jupyter_cache, see if there's a cache for this notebook
    file_path = file_path or env.doc2path(env.docname)
    dest_path = Path(env.app.outdir)
    reports_dir = str(dest_path) + "/reports"
    path_cache = False

    if not is_valid_exec_file(env, env.docname):
        return ntbk

    if "cache" in env.config["jupyter_execute_notebooks"]:
        path_cache = env.path_cache

    if not path_cache:
        if "off" not in env.config["jupyter_execute_notebooks"]:
            has_outputs = _read_nb_output_cells(
                file_path, env.config["jupyter_execute_notebooks"]
            )
            if not has_outputs:
                LOGGER.info("Executing: {}".format(env.docname))
                ntbk = execute(ntbk, cwd=Path(file_path).parent)
            else:
                LOGGER.info(
                    "Did not execute {}. "
                    "Set jupyter_execute_notebooks to `force` to execute".format(
                        env.docname
                    )
                )
        return ntbk

    cache_base = get_cache(path_cache)
    # Use relpath here in case Sphinx is building from a non-parent folder
    r_file_path = Path(os.path.relpath(file_path, Path().resolve()))

    try:
        _, ntbk = cache_base.merge_match_into_notebook(ntbk)
    except KeyError:
        message = (
            f"Couldn't find cache key for notebook file {str(r_file_path)}. "
            "Outputs will not be inserted."
        )
        try:
            stage_record = cache_base.get_staged_record(file_path)
        except KeyError:
            stage_record = None
        if stage_record and stage_record.traceback:
            # save the traceback to a log file
            ensuredir(reports_dir)
            file_name = os.path.splitext(r_file_path.name)[0]
            full_path = reports_dir + "/{}.log".format(file_name)
            with open(full_path, "w", encoding="utf8") as log_file:
                log_file.write(stage_record.traceback)
            message += "\n  Last execution failed with traceback saved in {}".format(
                full_path
            )
            if show_traceback:
                message += "\n" + stage_record.traceback
        LOGGER.error(message)
        # This is a 'fix' for jupyter_sphinx, which requires this value for
        # dumping the script file, to stop it from raising an exception if not
        # found: normally it would be added from the executed notebook but,
        # since we are already logging an error, we don't want to block the
        # whole build. So here we just add a dummy .txt extension
        if "language_info" not in ntbk.metadata:
            ntbk.metadata["language_info"] = nbf.from_dict({"file_extension": ".txt"})
    else:
        LOGGER.verbose("Merged cached outputs into %s", str(r_file_path))

    return ntbk

def execution_cache(app, builder, added, changed, removed):
    """If caching is required, stage and execute the added or modified notebooks,
    and cache them for further use.
    """
    jupyter_cache = False

    # all the added and changed notebooks should be operated on.
    # note docnames are paths relative to the sphinx root folder, with no extensions
    altered_docnames = added.union(changed)

    if app.config["jupyter_execute_notebooks"] not in ["force", "auto", "cache", "off"]:
        LOGGER.error(
            "Conf jupyter_execute_notebooks must be one of "
            "`force`, `auto`, `cache` or `off`"
        )
        exit(1)

    jupyter_cache = app.config["jupyter_cache"]

    exec_docnames = [
        docname for docname in altered_docnames if is_valid_exec_file(app.env, docname)
    ]
    LOGGER.verbose("MyST-NB: Potential docnames to execute: %s", exec_docnames)

    if "cache" in app.config["jupyter_execute_notebooks"]:
        if jupyter_cache:
            if os.path.isdir(jupyter_cache):
                path_cache = jupyter_cache
            else:
                LOGGER.error(
                    f"Path to jupyter_cache is not a directory: {jupyter_cache}"
                )
                exit(1)
        else:
            path_cache = Path(app.outdir).parent.joinpath(".jupyter_cache")

        app.env.path_cache = str(
            path_cache
        )  # TODO: is there a better way to make it accessible?

        cache_base = get_cache(path_cache)
        for path in removed:
            docpath = app.env.doc2path(path)
            # there is an issue in sphinx doc2path, whereby if the path does not
            # exist then it will be assigned the default source_suffix (usually .rst)
            # therefore, to be safe here, we run through all possible suffixes
            for suffix in app.env.allowed_nb_exec_suffixes:
                docpath = os.path.splitext(docpath)[0] + suffix
                if not os.path.exists(docpath):
                    cache_base.discard_staged_notebook(docpath)

        _stage_and_execute(
            app.env, exec_docnames, path_cache, app.config["execution_timeout"]
        )

    elif jupyter_cache:
        LOGGER.error(
            "If using conf jupyter_cache, please set jupyter_execute_notebooks"
            " to `cache`"
        )
        exit(1)

    return altered_docnames

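# The Sphinx configuration this handler reads, as it might appear in a
# project's conf.py. The values are illustrative assumptions; only the option
# names are taken from the code above:
extensions = ["myst_nb"]
jupyter_execute_notebooks = "cache"  # one of: force, auto, cache, off
jupyter_cache = ""                   # optional path to an existing cache directory
execution_timeout = 30               # seconds per notebook
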
def change_cache_limit(cache_path, limit):
    """Change the maximum number of notebooks stored in the cache."""
    db = get_cache(cache_path)
    db.change_cache_limit(limit)
    click.secho("Cache limit changed!", fg="green")