Example #1
from markdown_it import MarkdownIt
from markdown_it.rules_core import StateCore


def test_noneState():
    md = MarkdownIt()
    state = StateCore(None, md, {}, [])

    # Disable the rules before "inline" (notably "normalize",
    # which would fail on a None source string)
    rules = md.core.ruler.get_active_rules()
    md.core.ruler.enableOnly(rules[rules.index("inline"):])

    # Check that we can process a None src with an empty env and block_tokens
    md.core.process(state)
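
The test above relies on the same two-phase idiom used throughout this page: split the core rule chain at "inline", run the block-level rules first, then run "inline" and everything after it over the collected tokens. Here is a minimal, self-contained sketch of that idiom on an ordinary string (the APIs shown, get_active_rules, enableOnly, process and StateCore, are the same ones used above; the variable names are illustrative):

from markdown_it import MarkdownIt
from markdown_it.rules_core import StateCore

md = MarkdownIt()
rules = md.core.ruler.get_active_rules()

# Phase 1: block-level rules only ("inline" and later are skipped),
# so the resulting "inline" tokens still carry unparsed source content
md.core.ruler.enableOnly(rules[:rules.index("inline")])
block_state = StateCore("some *emphasised* text", md, {}, [])
md.core.process(block_state)

# Phase 2: re-run the chain from "inline" onward over the tokens
# collected in phase 1, expanding their inline content
md.core.ruler.enableOnly(rules[rules.index("inline"):])
inline_state = StateCore("", md, block_state.env, block_state.tokens)
md.core.process(inline_state)

print([t.type for t in inline_state.tokens])
# ['paragraph_open', 'inline', 'paragraph_close']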
Example #2
from typing import List, Tuple

import nbformat as nbf
from markdown_it import MarkdownIt
from markdown_it.rules_core import StateCore
from markdown_it.token import Token
from markdown_it.utils import AttrDict
from myst_parser.main import MdParserConfig, default_parser

# SphinxNBRenderer, contains_widgets and get_widgets are myst-nb internals,
# assumed to be importable from the surrounding package


def nb_to_tokens(
        ntbk: nbf.NotebookNode, config: MdParserConfig,
        renderer_plugin: str) -> Tuple[MarkdownIt, AttrDict, List[Token]]:
    """Parse the notebook content to a list of syntax tokens and an env,
    containing global data like reference definitions.
    """
    # set up the markdown parser
    md = default_parser(config)
    # Note we disable front-matter parsing,
    # because it is taken from the actual notebook metadata
    md.disable("front_matter", ignoreInvalid=True)
    md.renderer = SphinxNBRenderer(md)
    # make a sandbox where all the parsing global data,
    # like reference definitions, will be stored
    env = AttrDict()
    rules = md.core.ruler.get_active_rules()

    # First run only the pre-inline chains,
    # so we can collect all reference definitions etc. before resolving references
    def parse_block(src, start_line):
        with md.reset_rules():
            # enable only rules up to block
            md.core.ruler.enableOnly(rules[:rules.index("inline")])
            tokens = md.parse(src, env)
        for token in tokens:
            if token.map:
                token.map = [
                    start_line + token.map[0], start_line + token.map[1]
                ]
        for dup_ref in env.get("duplicate_refs", []):
            if "fixed" not in dup_ref:
                dup_ref["map"] = [
                    start_line + dup_ref["map"][0],
                    start_line + dup_ref["map"][1],
                ]
                dup_ref["fixed"] = True
        return tokens

    block_tokens = []
    source_map = ntbk.metadata.get("source_map", None)

    # get language lexer name
    langinfo = ntbk.metadata.get("language_info", {})
    lexer = langinfo.get("pygments_lexer", langinfo.get("name", None))
    # TODO log warning if lexer is still None

    for cell_index, nb_cell in enumerate(ntbk.cells):

        # if the source_map has been stored (for text-based notebooks),
        # we use that to define the starting line for each cell;
        # otherwise, we set a pseudo base derived from the cell index
        start_line = source_map[cell_index] if source_map else (cell_index + 1) * 10000
        start_line += 1  # use base 1 rather than 0

        # Skip empty cells
        if len(nb_cell["source"].strip()) == 0:
            continue

        # skip cells tagged for removal
        # TODO this logic should be deferred to a transform
        tags = nb_cell.metadata.get("tags", [])
        if ("remove_cell" in tags) or ("remove-cell" in tags):
            continue

        if nb_cell["cell_type"] == "markdown":

            # the tokens' line maps are offset by the cell's start line
            # (which encodes the cell index), so errors can be located
            block_tokens.extend(parse_block(nb_cell["source"], start_line))

        elif nb_cell["cell_type"] == "code":
            # here we do nothing but store the cell as a custom token
            block_tokens.append(
                Token(
                    "nb_code_cell",
                    "",
                    0,
                    meta={
                        "cell": nb_cell,
                        "lexer": lexer,
                        "renderer": renderer_plugin
                    },
                    map=[start_line, start_line],
                ))

    # Now that all definitions have been gathered,
    # we run the inline and post-inline chains to expand the text.
    # Note we assume here that these rules never require the actual source text,
    # acting only on the existing tokens
    state = StateCore(None, md, env, block_tokens)
    with md.reset_rules():
        md.core.ruler.enableOnly(rules[rules.index("inline"):])
        md.core.process(state)

    # Add the front matter.
    # Note that myst_parser serialises dict/list-like values when rendering
    # to docutils docinfo. These can be read back with `json.loads`.
    state.tokens = [
        Token("front_matter", "", 0, content=dict(ntbk.metadata))
    ] + state.tokens

    # If there are widgets, this will embed the state of all widgets in a script
    if contains_widgets(ntbk):
        state.tokens.append(
            Token("jupyter_widget_state",
                  "",
                  0,
                  meta={"state": get_widgets(ntbk)}))

    return md, env, state.tokens
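
A hypothetical invocation of nb_to_tokens, building a minimal notebook in memory with nbformat. The MdParserConfig import path is assumed for the myst-parser versions this snippet targets, and "default" is an illustrative renderer_plugin value:

import nbformat as nbf
from myst_parser.main import MdParserConfig  # import path assumed

ntbk = nbf.v4.new_notebook()
ntbk.cells = [
    nbf.v4.new_markdown_cell("See [mylink] for details."),
    nbf.v4.new_markdown_cell("[mylink]: https://example.com"),
]

md, env, tokens = nb_to_tokens(ntbk, MdParserConfig(), "default")
for tok in tokens:
    print(tok.type, tok.map)

Because every block-only pass runs before any inline pass, the reference definition in the second cell is already stored in env by the time [mylink] in the first cell is resolved.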
Example #3
from __future__ import annotations

from typing import Any

from markdown_it import MarkdownIt
from markdown_it.rules_core import StateCore
from markdown_it.token import Token
from nbformat import NotebookNode

# LoggerType and nb_node_to_dict are myst-nb internals,
# assumed to be importable from the surrounding package


def notebook_to_tokens(
    notebook: NotebookNode,
    mdit_parser: MarkdownIt,
    mdit_env: dict[str, Any],
    logger: LoggerType,
) -> list[Token]:
    # disable front-matter, since this is taken from the notebook
    mdit_parser.disable("front_matter", ignoreInvalid=True)
    # mdit_env stores global state, such as reference definitions

    # Parse block tokens only first, leaving inline parsing to a second phase
    # (required to collect all reference definitions, before assessing references).
    block_tokens = [Token("nb_initialise", "", 0, map=[0, 0])]
    for cell_index, nb_cell in enumerate(notebook.cells):

        # skip empty cells
        if len(nb_cell["source"].strip()) == 0:
            continue

        # skip cells tagged for removal
        tags = nb_cell.metadata.get("tags", [])
        if ("remove_cell" in tags) or ("remove-cell" in tags):
            continue

        # generate tokens
        tokens: list[Token]
        if nb_cell["cell_type"] == "markdown":
            # https://nbformat.readthedocs.io/en/5.1.3/format_description.html#markdown-cells
            # TODO if cell has tag output-caption, then use as caption for next/preceding cell?
            tokens = [
                Token(
                    "nb_cell_markdown_open",
                    "",
                    1,
                    hidden=True,
                    meta={
                        "index": cell_index,
                        "metadata": nb_node_to_dict(nb_cell["metadata"]),
                    },
                    map=[0, len(nb_cell["source"].splitlines()) - 1],
                ),
            ]
            with mdit_parser.reset_rules():
                # enable only the rules up to (but not including) "inline"
                rules = mdit_parser.core.ruler.get_active_rules()
                mdit_parser.core.ruler.enableOnly(rules[:rules.index("inline")])
                tokens.extend(mdit_parser.parse(nb_cell["source"], mdit_env))
            tokens.append(
                Token(
                    "nb_cell_markdown_close",
                    "",
                    -1,
                    hidden=True,
                ))
        elif nb_cell["cell_type"] == "raw":
            # https://nbformat.readthedocs.io/en/5.1.3/format_description.html#raw-nbconvert-cells
            tokens = [
                Token(
                    "nb_cell_raw",
                    "code",
                    0,
                    content=nb_cell["source"],
                    meta={
                        "index": cell_index,
                        "metadata": nb_node_to_dict(nb_cell["metadata"]),
                    },
                    map=[0, 0],
                )
            ]
        elif nb_cell["cell_type"] == "code":
            # https://nbformat.readthedocs.io/en/5.1.3/format_description.html#code-cells
            # we don't copy the outputs here, since this would
            # greatly increase the memory consumption;
            # instead they will be referenced by the cell index
            tokens = [
                Token(
                    "nb_cell_code",
                    "code",
                    0,
                    content=nb_cell["source"],
                    meta={
                        "index": cell_index,
                        "metadata": nb_node_to_dict(nb_cell["metadata"]),
                    },
                    map=[0, 0],
                )
            ]
        else:
            # unsupported cell type: warn and skip, otherwise `tokens`
            # from a previous iteration would be appended again below
            logger.warning(f"Unsupported cell type: {nb_cell['cell_type']!r}")
            continue

        # update token's source lines, using either a source_map (index -> line),
        # set when converting to a notebook, or a pseudo base of the cell index
        smap = notebook.metadata.get("source_map", None)
        start_line = smap[cell_index] if smap else (cell_index + 1) * 10000
        start_line += 1  # use base 1 rather than 0
        for token in tokens:
            if token.map:
                token.map = [
                    start_line + token.map[0], start_line + token.map[1]
                ]
        # also update the source lines for duplicate references
        for dup_ref in mdit_env.get("duplicate_refs", []):
            if "fixed" not in dup_ref:
                dup_ref["map"] = [
                    start_line + dup_ref["map"][0],
                    start_line + dup_ref["map"][1],
                ]
                dup_ref["fixed"] = True

        # add tokens to list
        block_tokens.extend(tokens)

    block_tokens.append(Token("nb_finalise", "", 0, map=[0, 0]))

    # Now all definitions have been gathered, run the inline parsing phase
    state = StateCore("", mdit_parser, mdit_env, block_tokens)
    with mdit_parser.reset_rules():
        rules = mdit_parser.core.ruler.get_active_rules()
        mdit_parser.core.ruler.enableOnly(rules[rules.index("inline"):])
        mdit_parser.core.process(state)

    return state.tokens
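
Compared with Example #2, this rewrite wraps each markdown cell in hidden nb_cell_markdown_open/nb_cell_markdown_close tokens, brackets the stream with nb_initialise/nb_finalise sentinels, and stores only the cell index for code cells, so outputs can be looked up later instead of being copied into the token stream. Below is a hypothetical call, substituting a bare MarkdownIt instance and a stdlib logger for myst-nb's configured parser and logger (it also assumes the nb_node_to_dict helper is available):

import logging

import nbformat
from markdown_it import MarkdownIt

notebook = nbformat.v4.new_notebook()
notebook.cells = [
    nbformat.v4.new_markdown_cell("See [mylink].\n\n[mylink]: https://example.com"),
    nbformat.v4.new_code_cell("print('hello')"),
]

# front_matter is disabled with ignoreInvalid=True, so a parser without
# that plugin still works for this sketch
tokens = notebook_to_tokens(notebook, MarkdownIt(), {}, logging.getLogger(__name__))
print([t.type for t in tokens])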