def nb_to_tokens(
    ntbk: nbf.NotebookNode, config: MdParserConfig, renderer_plugin: str
) -> Tuple[MarkdownIt, AttrDict, List[Token]]:
    """Parse the notebook content to a list of syntax tokens and an env,
    containing global data like reference definitions.
    """
    md = default_parser(config)  # setup the markdown parser
    # Note we disable front matter parsing,
    # because this is taken from the actual notebook metadata
    md.disable("front_matter", ignoreInvalid=True)
    md.renderer = SphinxNBRenderer(md)
    # make a sandbox where all the parsing global data,
    # like reference definitions, will be stored
    env = AttrDict()
    rules = md.core.ruler.get_active_rules()

    # First only run pre-inline chains,
    # so we can collect all reference definitions, etc, before assessing references
    def parse_block(src, start_line):
        with md.reset_rules():
            # enable only rules up to block
            md.core.ruler.enableOnly(rules[: rules.index("inline")])
            tokens = md.parse(src, env)
        for token in tokens:
            if token.map:
                token.map = [start_line + token.map[0], start_line + token.map[1]]
        for dup_ref in env.get("duplicate_refs", []):
            if "fixed" not in dup_ref:
                dup_ref["map"] = [
                    start_line + dup_ref["map"][0],
                    start_line + dup_ref["map"][1],
                ]
                dup_ref["fixed"] = True
        return tokens

    block_tokens = []
    source_map = ntbk.metadata.get("source_map", None)

    # get language lexer name
    langinfo = ntbk.metadata.get("language_info", {})
    lexer = langinfo.get("pygments_lexer", langinfo.get("name", None))
    # TODO log warning if lexer is still None

    for cell_index, nb_cell in enumerate(ntbk.cells):

        # if the source_map has been stored (for text-based notebooks),
        # we use that to define the starting line for each cell,
        # otherwise we set a pseudo base that represents the cell index
        start_line = source_map[cell_index] if source_map else (cell_index + 1) * 10000
        start_line += 1  # use base 1 rather than 0

        # Skip empty cells
        if len(nb_cell["source"].strip()) == 0:
            continue

        # skip cells tagged for removal
        # TODO this logic should be deferred to a transform
        tags = nb_cell.metadata.get("tags", [])
        if ("remove_cell" in tags) or ("remove-cell" in tags):
            continue

        if nb_cell["cell_type"] == "markdown":

            # we add the cell index to tokens,
            # so they can be included in the error logging
            block_tokens.extend(parse_block(nb_cell["source"], start_line))

        elif nb_cell["cell_type"] == "code":
            # here we do nothing but store the cell as a custom token
            block_tokens.append(
                Token(
                    "nb_code_cell",
                    "",
                    0,
                    meta={
                        "cell": nb_cell,
                        "lexer": lexer,
                        "renderer": renderer_plugin,
                    },
                    map=[start_line, start_line],
                )
            )

    # Now all definitions have been gathered,
    # we run the inline and post-inline chains, to expand the text.
    # Note we assume here that these rules never require the actual source text,
    # only acting on the existing tokens
    state = StateCore(None, md, env, block_tokens)
    with md.reset_rules():
        md.core.ruler.enableOnly(rules[rules.index("inline") :])
        md.core.process(state)

    # Add the front matter.
    # Note that myst_parser serialises dict/list like keys, when rendering to
    # docutils docinfo. These could be read back with `json.loads`.
    state.tokens = [
        Token(
            "front_matter",
            "",
            0,
            content=({k: v for k, v in ntbk.metadata.items()}),
        )
    ] + state.tokens

    # If there are widgets, this will embed the state of all widgets in a script
    if contains_widgets(ntbk):
        state.tokens.append(
            Token("jupyter_widget_state", "", 0, meta={"state": get_widgets(ntbk)})
        )

    return md, env, state.tokens
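# --- Usage sketch (an assumption, not part of the original module) ------------
# A minimal example of exercising `nb_to_tokens` in isolation. It assumes
# `MdParserConfig` is importable from `myst_parser.main` (the exact path varies
# by version), that "example.ipynb" exists on disk, and that "html" is an
# acceptable renderer plugin name; all three are illustrative choices.
if __name__ == "__main__":
    import nbformat as nbf
    from myst_parser.main import MdParserConfig  # assumed import location

    example_ntbk = nbf.read("example.ipynb", as_version=nbf.NO_CONVERT)
    md, env, tokens = nb_to_tokens(example_ntbk, MdParserConfig(), "html")
    for tok in tokens:
        # e.g. "nb_code_cell" tokens carry the raw cell in `tok.meta["cell"]`
        print(tok.type, tok.map)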
def parse(self, inputstring, document):

    # de-serialize the notebook
    ntbk = nbf.reads(inputstring, nbf.NO_CONVERT)

    # This is a container for top level markdown tokens,
    # which we will add to as we walk the document
    mkdown_tokens = []  # type: list[BlockToken]

    # First we ensure that we are using a 'clean' global context
    # for parsing, which is set up with the MyST parsing tokens;
    # the logger will report on duplicate link/footnote definitions, etc
    parse_context = ParseContext(
        find_blocks=SphinxNBRenderer.default_block_tokens,
        find_spans=SphinxNBRenderer.default_span_tokens,
        logger=SPHINX_LOGGER,
    )
    set_parse_context(parse_context)

    for cell_index, nb_cell in enumerate(ntbk.cells):

        # Skip empty cells
        if len(nb_cell["source"].strip()) == 0:
            continue

        # skip cells tagged for removal
        tags = nb_cell.metadata.get("tags", [])
        if "remove_cell" in tags:
            continue

        if nb_cell["cell_type"] == "markdown":

            # we add the document path and cell index
            # to the source lines, so they can be included in the error logging
            # NOTE: currently the logic to report metadata is not written
            # into SphinxRenderer, but this will be introduced in a later update
            lines = SourceLines(
                nb_cell["source"],
                uri=document["source"],
                metadata={"cell_index": cell_index},
                standardize_ends=True,
            )

            # parse the source markdown text;
            # at this point span/inline level tokens are not yet processed, but
            # link/footnote definitions are collected/stored in the global context
            mkdown_tokens.extend(tokenize_block(lines))

            # TODO for md cells, think of a way to implement the previous
            # `if "hide_input" in tags:` logic

        elif nb_cell["cell_type"] == "code":
            # here we do nothing but store the cell as a custom token
            mkdown_tokens.append(
                NbCodeCell(
                    cell=nb_cell,
                    position=Position(
                        line_start=0,
                        uri=document["source"],
                        data={"cell_index": cell_index},
                    ),
                )
            )

    # Now all definitions have been gathered, we walk the tokens and
    # process any inline text
    for token in mkdown_tokens + list(
        get_parse_context().foot_definitions.values()
    ):
        token.expand_spans()

    # If there are widgets, this will embed the state of all widgets in a script
    if contains_widgets(ntbk):
        mkdown_tokens.insert(0, JupyterWidgetState(state=get_widgets(ntbk)))

    # create the front matter token
    front_matter = FrontMatter(content=ntbk.metadata, position=None)

    # Finally, we create the top-level markdown document
    markdown_doc = Document(
        children=mkdown_tokens,
        front_matter=front_matter,
        link_definitions=parse_context.link_definitions,
        footnotes=parse_context.foot_definitions,
        footref_order=parse_context.foot_references,
    )

    self.reporter = document.reporter
    self.config = self.default_config.copy()
    try:
        new_cfg = document.settings.env.config.myst_config
        self.config.update(new_cfg)
    except AttributeError:
        pass

    # Remove all the mime prefixes from the "glue" step.
    # This way, writing properly captures the glued images
    replace_mime = []
    for cell in ntbk.cells:
        if hasattr(cell, "outputs"):
            for out in cell.outputs:
                if "data" in out:
                    # Only do the mimebundle replacing for the scrapbook outputs
                    mime_prefix = (
                        out.get("metadata", {})
                        .get("scrapbook", {})
                        .get("mime_prefix")
                    )
                    if mime_prefix:
                        out["data"] = {
                            key.replace(mime_prefix, ""): val
                            for key, val in out["data"].items()
                        }
                        replace_mime.append(out)

    # Write the notebook's output to disk.
    # This changes metadata in notebook cells
    path_doc = Path(document.settings.env.docname)
    doc_relpath = path_doc.parent
    doc_filename = path_doc.name
    build_dir = Path(document.settings.env.app.outdir).parent
    output_dir = build_dir.joinpath("jupyter_execute", doc_relpath)
    write_notebook_output(ntbk, str(output_dir), doc_filename)

    # Now add back the mime prefixes to the right outputs, so they aren't rendered
    # until called from the role/directive
    for out in replace_mime:
        out["data"] = {
            f"{GLUE_PREFIX}{key}": val for key, val in out["data"].items()
        }

    # Update our glue key list with new ones defined in this page
    glue_domain = NbGlueDomain.from_env(document.settings.env)
    glue_domain.add_notebook(ntbk, path_doc)

    # render the Markdown AST to docutils AST
    renderer = SphinxNBRenderer(
        parse_context=parse_context, document=document, current_node=None
    )
    renderer.render(markdown_doc)
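# --- Registration sketch (assumed wiring, not part of the original source) ----
# The `parse` method above is written for a docutils `Parser` subclass that
# Sphinx hands ".ipynb" files to. The class name `NotebookParser`, the filetype
# label "jupyter_notebook" and the empty `default_config` placeholder are
# assumptions made for illustration; only `add_source_suffix` and
# `add_source_parser` are standard Sphinx application APIs.
from docutils import parsers


class NotebookParser(parsers.Parser):
    """Hypothetical host class for the ``parse`` method defined above."""

    supported = ("jupyter_notebook",)
    default_config = {}  # placeholder; the real class would carry MyST defaults

    parse = parse  # reuse the module-level ``parse`` above (sketch only)


def setup(app):
    # map the file suffix to a filetype name, then register the parser whose
    # ``supported`` tuple names that filetype
    app.add_source_suffix(".ipynb", "jupyter_notebook")
    app.add_source_parser(NotebookParser)
    return {"parallel_read_safe": True}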