def test_noneState():
    md = MarkdownIt()
    state = StateCore(None, md, {}, [])
    # Remove normalizing rule
    rules = md.core.ruler.get_active_rules()
    md.core.ruler.enableOnly(rules[rules.index("inline"):])
    # Check that we can process None str with empty env and block_tokens
    md.core.process(state)
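
# A minimal sketch of the two-phase parsing pattern exercised above and used by the
# functions below: block-level rules run first (collecting reference definitions into
# `env`), then the "inline" chain and later rules run over the collected tokens via
# StateCore. Illustrative only; it assumes markdown-it-py's public API
# (`MarkdownIt`, `StateCore`, `reset_rules`) and a made-up source string.
from markdown_it import MarkdownIt
from markdown_it.rules_core import StateCore


def demo_two_phase_parse():
    md = MarkdownIt()
    env: dict = {}
    rules = md.core.ruler.get_active_rules()

    # Phase 1: only run rules before "inline", so reference definitions
    # are gathered into env before any references are resolved.
    with md.reset_rules():
        md.core.ruler.enableOnly(rules[:rules.index("inline")])
        block_tokens = md.parse("[a link][ref]\n\n[ref]: https://example.com\n", env)

    # Phase 2: run "inline" and the remaining core rules over those tokens.
    state = StateCore("", md, env, block_tokens)
    with md.reset_rules():
        md.core.ruler.enableOnly(rules[rules.index("inline"):])
        md.core.process(state)

    return state.tokens
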
def nb_to_tokens(
    ntbk: nbf.NotebookNode, config: MdParserConfig, renderer_plugin: str
) -> Tuple[MarkdownIt, AttrDict, List[Token]]:
    """Parse the notebook content to a list of syntax tokens and an env,
    containing global data like reference definitions.
    """
    md = default_parser(config)  # setup the markdown parser
    # Note we disable front matter parsing,
    # because this is taken from the actual notebook metadata
    md.disable("front_matter", ignoreInvalid=True)
    md.renderer = SphinxNBRenderer(md)
    # make a sandbox where all the parsing global data,
    # like reference definitions, will be stored
    env = AttrDict()
    rules = md.core.ruler.get_active_rules()

    # First only run pre-inline chains,
    # so we can collect all reference definitions, etc, before assessing references
    def parse_block(src, start_line):
        with md.reset_rules():
            # enable only rules up to block
            md.core.ruler.enableOnly(rules[:rules.index("inline")])
            tokens = md.parse(src, env)
        for token in tokens:
            if token.map:
                token.map = [start_line + token.map[0], start_line + token.map[1]]
        for dup_ref in env.get("duplicate_refs", []):
            if "fixed" not in dup_ref:
                dup_ref["map"] = [
                    start_line + dup_ref["map"][0],
                    start_line + dup_ref["map"][1],
                ]
                dup_ref["fixed"] = True
        return tokens

    block_tokens = []
    source_map = ntbk.metadata.get("source_map", None)

    # get language lexer name
    langinfo = ntbk.metadata.get("language_info", {})
    lexer = langinfo.get("pygments_lexer", langinfo.get("name", None))
    # TODO log warning if lexer is still None

    for cell_index, nb_cell in enumerate(ntbk.cells):

        # if the source_map has been stored (for text-based notebooks),
        # we use that to define the starting line for each cell,
        # otherwise, we set a pseudo base that represents the cell index
        start_line = source_map[cell_index] if source_map else (cell_index + 1) * 10000
        start_line += 1  # use base 1 rather than 0

        # Skip empty cells
        if len(nb_cell["source"].strip()) == 0:
            continue

        # skip cells tagged for removal
        # TODO this logic should be deferred to a transform
        tags = nb_cell.metadata.get("tags", [])
        if ("remove_cell" in tags) or ("remove-cell" in tags):
            continue

        if nb_cell["cell_type"] == "markdown":

            # we add the cell index to tokens,
            # so they can be included in the error logging
            block_tokens.extend(parse_block(nb_cell["source"], start_line))

        elif nb_cell["cell_type"] == "code":
            # here we do nothing but store the cell as a custom token
            block_tokens.append(
                Token(
                    "nb_code_cell",
                    "",
                    0,
                    meta={
                        "cell": nb_cell,
                        "lexer": lexer,
                        "renderer": renderer_plugin,
                    },
                    map=[start_line, start_line],
                )
            )

    # Now all definitions have been gathered,
    # we run inline and post-inline chains, to expand the text.
    # Note we assume here that these rules never require the actual source text,
    # only acting on the existing tokens
    state = StateCore(None, md, env, block_tokens)
    with md.reset_rules():
        md.core.ruler.enableOnly(rules[rules.index("inline"):])
        md.core.process(state)

    # Add the front matter.
    # Note that myst_parser serialises dict/list like keys, when rendering to
    # docutils docinfo. These could be read back with `json.loads`.
    state.tokens = [
        Token(
            "front_matter",
            "",
            0,
            content=({k: v for k, v in ntbk.metadata.items()}),
        )
    ] + state.tokens

    # If there are widgets, this will embed the state of all widgets in a script
    if contains_widgets(ntbk):
        state.tokens.append(
            Token("jupyter_widget_state", "", 0, meta={"state": get_widgets(ntbk)})
        )

    return md, env, state.tokens
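
# A hedged usage sketch for `nb_to_tokens`: read a notebook with nbformat and
# tokenise it. `MdParserConfig` and `nb_to_tokens` are assumed to be importable
# from the surrounding packages (myst_parser / myst_nb); "example.ipynb" and
# renderer_plugin="default" are placeholder values, not prescribed by this module.
import nbformat as nbf


def demo_nb_to_tokens():
    ntbk = nbf.read("example.ipynb", as_version=4)  # hypothetical notebook path
    config = MdParserConfig()  # default MyST parser options (assumed available)
    md, env, tokens = nb_to_tokens(ntbk, config, renderer_plugin="default")

    # Markdown cells contribute ordinary markdown-it tokens; each code cell
    # contributes a single "nb_code_cell" token whose meta carries the raw cell.
    for token in tokens:
        print(token.type, token.map)
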
def notebook_to_tokens(
    notebook: NotebookNode,
    mdit_parser: MarkdownIt,
    mdit_env: dict[str, Any],
    logger: LoggerType,
) -> list[Token]:
    # disable front-matter, since this is taken from the notebook
    mdit_parser.disable("front_matter", ignoreInvalid=True)
    # mdit_env stores global state, such as reference definitions

    # Parse block tokens only first, leaving inline parsing to a second phase
    # (required to collect all reference definitions, before assessing references).
    block_tokens = [Token("nb_initialise", "", 0, map=[0, 0])]
    for cell_index, nb_cell in enumerate(notebook.cells):

        # skip empty cells
        if len(nb_cell["source"].strip()) == 0:
            continue

        # skip cells tagged for removal
        tags = nb_cell.metadata.get("tags", [])
        if ("remove_cell" in tags) or ("remove-cell" in tags):
            continue

        # generate tokens
        tokens: list[Token]
        if nb_cell["cell_type"] == "markdown":
            # https://nbformat.readthedocs.io/en/5.1.3/format_description.html#markdown-cells
            # TODO if cell has tag output-caption, then use as caption for next/preceding cell?
            tokens = [
                Token(
                    "nb_cell_markdown_open",
                    "",
                    1,
                    hidden=True,
                    meta={
                        "index": cell_index,
                        "metadata": nb_node_to_dict(nb_cell["metadata"]),
                    },
                    map=[0, len(nb_cell["source"].splitlines()) - 1],
                ),
            ]
            with mdit_parser.reset_rules():
                # enable only rules up to block
                rules = mdit_parser.core.ruler.get_active_rules()
                mdit_parser.core.ruler.enableOnly(rules[:rules.index("inline")])
                tokens.extend(mdit_parser.parse(nb_cell["source"], mdit_env))
            tokens.append(
                Token(
                    "nb_cell_markdown_close",
                    "",
                    -1,
                    hidden=True,
                ),
            )
        elif nb_cell["cell_type"] == "raw":
            # https://nbformat.readthedocs.io/en/5.1.3/format_description.html#raw-nbconvert-cells
            tokens = [
                Token(
                    "nb_cell_raw",
                    "code",
                    0,
                    content=nb_cell["source"],
                    meta={
                        "index": cell_index,
                        "metadata": nb_node_to_dict(nb_cell["metadata"]),
                    },
                    map=[0, 0],
                )
            ]
        elif nb_cell["cell_type"] == "code":
            # https://nbformat.readthedocs.io/en/5.1.3/format_description.html#code-cells
            # we don't copy the outputs here, since this would
            # greatly increase the memory consumption,
            # instead they will be referenced by the cell index
            tokens = [
                Token(
                    "nb_cell_code",
                    "code",
                    0,
                    content=nb_cell["source"],
                    meta={
                        "index": cell_index,
                        "metadata": nb_node_to_dict(nb_cell["metadata"]),
                    },
                    map=[0, 0],
                )
            ]
        else:
            pass  # TODO create warning

        # update tokens' source lines, using either a source_map (index -> line),
        # set when converting to a notebook, or a pseudo base of the cell index
        smap = notebook.metadata.get("source_map", None)
        start_line = smap[cell_index] if smap else (cell_index + 1) * 10000
        start_line += 1  # use base 1 rather than 0
        for token in tokens:
            if token.map:
                token.map = [start_line + token.map[0], start_line + token.map[1]]
        # also update the source lines for duplicate references
        for dup_ref in mdit_env.get("duplicate_refs", []):
            if "fixed" not in dup_ref:
                dup_ref["map"] = [
                    start_line + dup_ref["map"][0],
                    start_line + dup_ref["map"][1],
                ]
                dup_ref["fixed"] = True

        # add tokens to list
        block_tokens.extend(tokens)

    block_tokens.append(Token("nb_finalise", "", 0, map=[0, 0]))

    # Now all definitions have been gathered, run the inline parsing phase
    state = StateCore("", mdit_parser, mdit_env, block_tokens)
    with mdit_parser.reset_rules():
        rules = mdit_parser.core.ruler.get_active_rules()
        mdit_parser.core.ruler.enableOnly(rules[rules.index("inline"):])
        mdit_parser.core.process(state)

    return state.tokens
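
# A hedged usage sketch for `notebook_to_tokens`, assuming it is importable from
# the surrounding package (myst_nb). A plain MarkdownIt() and a stdlib logger
# stand in for the project's configured parser and LoggerType; the notebook
# path is a placeholder.
import logging

import nbformat
from markdown_it import MarkdownIt


def demo_notebook_to_tokens():
    notebook = nbformat.read("example.ipynb", as_version=4)  # hypothetical path
    mdit_parser = MarkdownIt()
    mdit_env: dict = {}
    logger = logging.getLogger(__name__)

    tokens = notebook_to_tokens(notebook, mdit_parser, mdit_env, logger)

    # Expected shape: "nb_initialise", then per cell either
    # "nb_cell_markdown_open" ... "nb_cell_markdown_close", "nb_cell_raw" or
    # "nb_cell_code", and finally "nb_finalise".
    print([t.type for t in tokens])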