def read(cls, lines: SourceLines):
        """Read an initial ``---`` delimited metadata block from ``lines``.

        The first line (the opening ``---``) is consumed, then every following
        line is collected until a line starting with ``---`` is found (which is
        also consumed) or the input is exhausted. If the input ends first, a
        warning is emitted through the logger of the current parse context.

        :param lines: the source lines, whose next line is the opening ``---``
        :returns: an instance of ``cls`` holding the joined block content
            and its source position
        """
        start_line = lines.lineno + 1

        next(lines)  # consume the opening ``---``
        collected = []
        while True:
            upcoming = lines.peek()
            if upcoming is None or upcoming.startswith("---"):
                break
            collected.append(next(lines))

        # ``upcoming`` is None only if the input ended before a closing ``---``
        unterminated = upcoming is None
        if not unterminated:
            next(lines)  # consume the closing ``---``

        position = Position.from_source_lines(lines, start_line=start_line)
        if unterminated:
            message = "{} No closing `---` was found for initial metadata block".format(
                position.make_loc_str()
            )
            get_parse_context().logger.warning(message)

        return cls(content="".join(collected), position=position)
Example #2
0
 def inline_text(self, text: str, lineno: int):
     """Parse ``text`` as a nested document and return its inline nodes.

     A new renderer (of the same class as ``self._renderer``) is instantiated,
     so that the nested parse does not affect the current renderer. It is
     given the same global parse context, so that link references, etc.
     are added to the global parse.

     :param text: the source text to parse
     :param lineno: not read by this method; the start line passed to
         ``SourceLines`` is ``self._lineno``
     :returns: ``(textnodes, messages)``, where ``textnodes`` are the children
         of the first rendered child (or an empty list if nothing was rendered)
         and ``messages`` is always an empty list
     """
     # TODO return messages?
     messages = []
     container = nodes.paragraph("")
     nested_renderer = self._renderer.__class__(
         document=self.document,
         current_node=container,
         parse_context=get_parse_context(),
     )
     source = SourceLines(
         text,
         start_line=self._lineno,
         uri=self.document["source"],
         metadata=self._token.position.data,
         standardize_ends=True,
     )
     doc_token = myst_block_tokens.Document.read(
         source, front_matter=False, reset_definitions=False
     )
     # we mark the token as nested so that footnotes etc aren't rendered
     doc_token.is_nested = True
     nested_renderer.render(doc_token)

     # the first child should be a paragraph; its children are the text nodes
     if not container.children:
         return [], messages
     return container.children[0].children, messages
Example #3
0
def tokenize_main(
        lines: SourceLines,
        token_types=None,
        expand_spans: bool = True,
        skip_tokens: list = ("LinkDefinition", "Footnote"),
):
    """Tokenize source lines into block-level tokens.

    :param lines: the source lines; anything that is not already a
        ``SourceLines`` instance is wrapped in one
    :param token_types: override block-level tokens set in global context
    :param expand_spans: After the initial parse the span text is not yet
        tokenized, but stored instead as raw text in `SpanContainer`, in order
        to ensure all link definitions are read first. Setting True, runs a
        second walk of the syntax tree to replace these `SpanContainer` with
        the final span tokens.
    :param skip_tokens: do not store these ``token.name`` in the syntax tree.
        These are usually tokens that store themselves in the global context

    :returns: list of block-level token instances.
    """
    source = lines if isinstance(lines, SourceLines) else SourceLines(lines)
    block_types = (
        get_parse_context().block_tokens if token_types is None else token_types
    )
    tokens = tokenize_block(
        source, token_types=block_types, skip_tokens=skip_tokens
    )
    if not expand_spans:
        return tokens

    # footnote definitions held in the global parse context are expanded
    # together with the tokens that are returned
    pending = tokens + list(get_parse_context().foot_definitions.values())
    for token in pending:
        token.expand_spans()
    return tokens
Example #4
0
def test_fenced_code(name, source, data_regression):
    """Regression-check the serialized tokens produced from ``source``."""
    from mistletoe.base_elements import SourceLines

    # echo the source as split into lines
    print(SourceLines(source).lines)

    tokens = tokenize_main(source)
    serialized = serialize_tokens(tokens, as_dict=True)
    data_regression.check(serialized, basename=f"test_fenced_code_{name}")
 def read(cls, lines: SourceLines):
     """Read a run of non-blank lines and extract the definitions they hold.

     Lines are consumed up to (not including) the next blank line or the end
     of the input. ``cls.match_reference`` is then applied repeatedly to the
     joined text, starting at the offset returned by the previous match,
     until it returns ``None`` or the end of the text is reached.

     ``cls.append_link_definitions`` is called with whatever was matched,
     including when nothing matched.

     :param lines: the source lines to read from
     :returns: an instance of ``cls`` if at least one definition was matched,
         otherwise ``None``
     """
     start_line = lines.lineno + 1

     collected = []
     while True:
         upcoming = lines.peek()
         if upcoming is None or upcoming.strip() == "":
             break
         collected.append(next(lines))
     text = "".join(collected)

     definitions = []
     cursor = 0
     last_index = len(text) - 1
     while cursor < last_index:
         found = cls.match_reference(lines, text, cursor)
         if found is None:
             break
         cursor, definition = found
         definitions.append(definition)

     position = Position.from_source_lines(lines, start_line=start_line)
     cls.append_link_definitions(definitions, position)
     if not definitions:
         return None
     return cls(position=position, definitions=definitions)
    def read(cls, lines):
        """Read a ``>``-prefixed block and tokenize its content.

        The ``>`` leader (and one optional following space) is removed from
        each line that has it. Lines without a leader are kept verbatim as
        lazy continuation text, unless the previous content line opened a
        code fence, opened a block code, or was blank, in which case reading
        stops. Reading also stops when ``cls.transition`` returns true for
        the upcoming line.

        The collected lines are tokenized at block level, with
        ``Paragraph.parse_setext`` switched off for the duration.

        :param lines: the source lines; the next line must contain a ``>``
        :returns: an instance of ``cls`` whose children are the block tokens
            parsed from the de-prefixed lines
        """
        # first line
        start_line = lines.lineno + 1
        line = cls.convert_leading_tabs(next(lines).lstrip()).split(">", 1)[1]
        if len(line) > 0 and line[0] == " ":
            line = line[1:]
        line_buffer = [line]

        # set booleans
        in_code_fence = CodeFence.start(line)
        in_block_code = BlockCode.start(line)
        blank_line = line.strip() == ""

        # loop
        next_line = lines.peek()
        while not cls.transition(next_line):
            stripped = cls.convert_leading_tabs(next_line.lstrip())
            prepend = 0
            if stripped[0] == ">":
                # has leader, not lazy continuation
                prepend += 1
                # slice rather than index: a bare ">" with nothing after it
                # (e.g. a final line without a line ending) must not raise
                if stripped[1:2] == " ":
                    prepend += 1
                stripped = stripped[prepend:]
                in_code_fence = CodeFence.start(stripped)
                in_block_code = BlockCode.start(stripped)
                blank_line = stripped.strip() == ""
                line_buffer.append(stripped)
            elif in_code_fence or in_block_code or blank_line:
                # not paragraph continuation text
                break
            else:
                # lazy continuation, preserve whitespace
                line_buffer.append(next_line)
            next(lines)
            next_line = lines.peek()

        # block level tokens are parsed here, so that link_definitions
        # in quotes can be recognized before span-level tokenizing.
        Paragraph.parse_setext = False
        try:
            child_tokens = tokenizer.tokenize_block(
                SourceLines(line_buffer, start_line=start_line)
            )
        finally:
            # always switch setext parsing back on, even if tokenizing fails
            Paragraph.parse_setext = True
        return cls(
            children=child_tokens,
            position=Position.from_source_lines(lines, start_line=start_line),
        )
Example #7
0
def tokenize_block(lines: SourceLines,
                   token_types=None,
                   skip_tokens=("LinkDefinition", "Footnote")):
    """Parse ``lines`` into a buffer of block-level tokens.

    For each upcoming line, the token types are tried in order; the first one
    whose ``start`` accepts the line and whose ``read`` returns a token wins.
    Tokens whose ``name`` is in ``skip_tokens`` are read but not stored.
    If no token type produces a token, the line is skipped and the buffer is
    flagged as ``loose``.

    :param lines: the source lines (must be a ``SourceLines`` instance)
    :param token_types: the block token classes to try, in order; defaults to
        the ``block_tokens`` of the current parse context
    :param skip_tokens: names of tokens that are not stored in the result
    :returns: a ``ParseBuffer`` of the parsed tokens
    """
    assert isinstance(lines,
                      SourceLines), "lines must be `SourceLines` instance"
    if token_types is None:
        token_types = get_parse_context().block_tokens

    parsed_tokens = ParseBuffer()
    while True:
        line = lines.peek()
        if line is None:
            break

        matched = False
        for token_type in token_types:
            if not token_type.start(line):
                continue
            token = token_type.read(lines)
            if token is None:
                # this type declined after all; try the next one
                continue
            if token.name not in skip_tokens:
                parsed_tokens.append(token)
            matched = True
            break

        if not matched:  # unmatched newlines
            next(lines)
            parsed_tokens.loose = True
    return parsed_tokens
Example #8
0
 def nested_render_text(self, text: str, lineno: int, token):
     """Render unparsed text.

     The text is read as a nested document (front matter allowed, stored
     definitions not reset) and rendered with this renderer.

     :param text: the source text to parse and render
     :param lineno: the line number the text starts at
     :param token: the token the text belongs to; its ``position.data`` is
         passed on as the metadata of the source lines
     """
     source = SourceLines(
         text,
         start_line=lineno,
         uri=self.document["source"],
         metadata=token.position.data,
         standardize_ends=True,
     )
     nested_doc = myst_block_tokens.Document.read(
         source, front_matter=True, reset_definitions=False
     )
     # TODO think if this is the best way: here we consume front matter,
     # but then remove it. this is for example if includes have front matter
     nested_doc.front_matter = None
     # we mark the token as nested so that footnotes etc aren't rendered
     nested_doc.is_nested = True
     self.render(nested_doc)
    def read(
        cls,
        lines: Union[str, ListType[str], SourceLines],
        reset_definitions: bool = True,
        skip_tokens: list = ("LinkDefinition", "Footnote"),
        front_matter: bool = False,
    ):
        """Read a document

        :param lines: Lines to parse; anything that is not a ``SourceLines``
            instance is wrapped in one (with standardized line endings)
        :param reset_definitions: remove any previously stored definitions
            in the global context (see ``ParseContext.reset_definitions()``).
        :param skip_tokens: do not store these ``token.name`` in the syntax tree.
            These are usually tokens that store themselves in the global context.
        :param front_matter: search for an initial YAML block front matter block
            (note this is not strictly CommonMark compliant)
        :returns: an instance of ``cls`` holding the parsed children, the front
            matter token (or ``None``) and the definitions/footnotes gathered
            in the global parse context
        """
        if reset_definitions:
            get_parse_context().reset_definitions()

        source = (
            lines
            if isinstance(lines, SourceLines)
            else SourceLines(lines, standardize_ends=True)
        )

        # TODO can we do this in a way where we are checking
        # FrontMatter in get_parse_context().block_tokens?
        # then it would be easier to add/remove it in the renderers
        front_matter_token = None
        if front_matter:
            first_line = source.peek()
            if first_line and first_line.startswith("---"):
                front_matter_token = FrontMatter.read(source)

        children = tokenizer.tokenize_main(lines=source, skip_tokens=skip_tokens)

        context = get_parse_context()
        foot_defs = context.foot_definitions
        # only keep references that have a matching footnote definition
        referenced = [t for t in context.foot_references if t in foot_defs]
        return cls(
            children=children,
            front_matter=front_matter_token,
            link_definitions=context.link_definitions,
            footnotes=foot_defs,
            footref_order=referenced,
        )
Example #10
0
    def parse(self, inputstring: str, document: nodes.document):
        """Parse source text.

        ``self.config`` is reset to a copy of ``self.default_config`` and then
        updated with ``myst_config`` from the Sphinx configuration attached to
        ``document``, when there is one.

        :param inputstring: The source string to parse
        :param document: The root docutils node to add AST elements to
        """
        # TODO add conf.py configurable settings
        self.config = self.default_config.copy()
        try:
            # read the configuration from the document being parsed;
            # ``self.document`` is not assigned anywhere in this method, so
            # looking it up there could fail and silently skip the update
            new_cfg = document.settings.env.config.myst_config
            self.config.update(new_cfg)
        except AttributeError:
            # no Sphinx environment / ``myst_config`` available: keep defaults
            pass
        renderer = SphinxRenderer(document=document)
        with renderer:
            # Log to sphinx (e.g. to warn of duplicate link/footnote definitions)
            renderer.parse_context.logger = SPHINX_LOGGER
            lines = SourceLines(inputstring,
                                uri=document["source"],
                                standardize_ends=True)
            doc = Document.read(lines)
            renderer.render(doc)
    def read(cls, lines, prev_marker=None):
        """Read one marker-led item and tokenize its content at block level.

        :param lines: the source lines; the next line is the item's first line
        :param prev_marker: an already parsed ``(prepend, leader)`` pair for the
            first line; when falsy, ``cls.parse_marker`` is called on that line
        :returns: an instance of ``cls`` with the child tokens, their ``loose``
            flag, the marker's ``prepend``/``leader``, and ``next_marker`` set to
            the parsed marker of the following line when that line starts a new
            item (otherwise ``None``)
        """
        next_marker = None
        # NOTE(review): presumably records the current position so that a
        # caller can rewind to it -- confirm against ``SourceLines.anchor``
        lines.anchor()
        prepend = -1
        leader = None
        start_line = lines.lineno
        line_buffer = []

        # first line
        line = next(lines)
        prepend, leader = prev_marker if prev_marker else cls.parse_marker(line)
        # the tab directly after the leader (first occurrence only) becomes
        # three spaces; every remaining tab becomes four spaces
        line = line.replace(leader + "\t", leader + "   ", 1).replace("\t", "    ")
        empty_first_line = line[prepend:].strip() == ""
        if not empty_first_line:
            line_buffer.append(line[prepend:])
        next_line = lines.peek()
        # a marker with no content, directly followed by a blank line:
        # only that blank line is consumed and tokenized, then we return early
        if empty_first_line and next_line is not None and next_line.strip() == "":
            child_tokens = tokenizer.tokenize_block(
                SourceLines([next(lines)], start_line=lines.lineno)
            )
            next_line = lines.peek()
            if next_line is not None:
                marker_info = cls.parse_marker(next_line)
                if marker_info is not None:
                    next_marker = marker_info
            return cls(
                children=child_tokens,
                loose=child_tokens.loose,
                prepend=prepend,
                leader=leader,
                next_marker=next_marker,
                position=Position.from_source_lines(lines, start_line=start_line),
            )

        # loop
        # ``newline`` counts the consecutive blank lines at the end of
        # ``line_buffer``; it is reset to 0 by any non-blank line
        newline = 0
        while True:
            # no more lines
            if next_line is None:
                # strip off newlines
                if newline:
                    # NOTE(review): ``backstep`` presumably moves the read
                    # position back by one line -- confirm; it is called once
                    # regardless of how many blank lines are dropped
                    lines.backstep()
                    del line_buffer[-newline:]
                break
            next_line = next_line.replace("\t", "    ")
            # not in continuation
            if not cls.in_continuation(next_line, prepend):
                # directly followed by another token
                if cls.transition(next_line):
                    if newline:
                        lines.backstep()
                        del line_buffer[-newline:]
                    break
                # next_line is a new list item
                marker_info = cls.parse_marker(next_line)
                if marker_info is not None:
                    # trailing blank lines are kept in the buffer in this case
                    next_marker = marker_info
                    break
                # not another item, has newlines -> not continuation
                if newline:
                    lines.backstep()
                    del line_buffer[-newline:]
                    break
            # the line belongs to this item: consume it
            next(lines)
            line = next_line
            stripped = line.lstrip(" ")
            diff = len(line) - len(stripped)
            # remove the leading spaces, but keep any indentation beyond the
            # marker width (``prepend``) as leading spaces
            if diff > prepend:
                stripped = " " * (diff - prepend) + stripped
            line_buffer.append(stripped)
            newline = newline + 1 if next_line.strip() == "" else 0
            next_line = lines.peek()

        child_tokens = tokenizer.tokenize_block(
            SourceLines(line_buffer, start_line=start_line)
        )

        return cls(
            children=child_tokens,
            loose=child_tokens.loose,
            prepend=prepend,
            leader=leader,
            next_marker=next_marker,
            position=Position.from_source_lines(lines, start_line=start_line),
        )
 def test_render_heading(self):
     """A rendered heading is recorded as a ``(level, text)`` pair."""
     toc_renderer = TOCRenderer()
     # the heading source is given to ``Heading.start``; the lines passed to
     # ``Heading.read`` are different and do not appear in the expected entry
     Heading.start("### some *text*\n")
     heading = Heading.read(SourceLines(["foo"]), expand_spans=True)
     toc_renderer.render_heading(heading)
     first_entry = toc_renderer._headings[0]
     self.assertEqual(first_entry, (3, "some text"))
Example #13
0
def myst_to_notebook(
    text,
    code_directive=CODE_DIRECTIVE,
    raw_directive=RAW_DIRECTIVE,
    ignore_bad_meta=False,
    store_line_numbers=False,
):
    """Convert text written in the myst format to a notebook.

    The text is parsed into a document (with front matter), and the document
    is walked for block breaks and code fences. The source text between these
    tokens becomes markdown cells; code fences whose language is one of the
    two directives become code/raw cells.

    :param text: the file text (or an existing ``SourceLines`` instance)
    :param code_directive: the name of the directive to search for containing code cells
    :param raw_directive: the name of the directive to search for containing raw cells
    :param ignore_bad_meta: ignore metadata that cannot be parsed as JSON/YAML;
        such metadata is replaced by an empty mapping
    :param store_line_numbers: add a `_source_lines` key to cell metadata,
        mapping to the source text.

    :raises MystMetadataParsingError: if notebook, markdown cell or code cell
        metadata cannot be read and ``ignore_bad_meta`` is false

    :returns: the new notebook

    NOTE: we assume here that all of these directives are at the top-level,
    i.e. not nested in other directives.
    """
    from mistletoe.base_elements import SourceLines
    from mistletoe.parse_context import (
        ParseContext,
        get_parse_context,
        set_parse_context,
    )
    from mistletoe.block_tokens import Document, CodeFence

    from myst_parser.block_tokens import BlockBreak
    from myst_parser.parse_directives import DirectiveParsingError, parse_directive_text
    from myst_parser.docutils_renderer import DocutilsRenderer

    # the fence language of a directive is its name wrapped in braces
    code_directive = "{{{0}}}".format(code_directive)
    raw_directive = "{{{0}}}".format(raw_directive)

    # parse with a dedicated context; the original is restored in ``finally``
    original_context = get_parse_context()
    parse_context = ParseContext(
        find_blocks=DocutilsRenderer.default_block_tokens,
        find_spans=DocutilsRenderer.default_span_tokens,
    )

    if isinstance(text, SourceLines):
        lines = text
    else:
        lines = SourceLines(text, standardize_ends=True)

    try:
        set_parse_context(parse_context)
        doc = Document.read(lines, front_matter=True)
        metadata_nb = {}
        try:
            metadata_nb = doc.front_matter.get_data() if doc.front_matter else {}
        except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
            if not ignore_bad_meta:
                raise MystMetadataParsingError(
                    "Notebook metadata: {}".format(error)
                ) from error
        nbf_version = nbf.v4
        kwargs = {"metadata": nbf.from_dict(metadata_nb)}
        notebook = nbf_version.new_notebook(**kwargs)

        # index into ``lines.lines`` of the first line not yet put in a cell
        current_line = 0 if not doc.front_matter else doc.front_matter.position.line_end
        # metadata for the next markdown cell (set by the preceding block break)
        md_metadata = {}

        for item in doc.walk(["CodeFence", "BlockBreak"]):
            if isinstance(item.node, BlockBreak):
                token = item.node  # type: BlockBreak
                # flush the text before the break as a markdown cell
                source = _fmt_md(
                    "".join(lines.lines[current_line:token.position.line_start - 1])
                )
                if source:
                    md_metadata = nbf.from_dict(md_metadata)
                    if store_line_numbers:
                        md_metadata["_source_lines"] = [
                            current_line,
                            token.position.line_start - 1,
                        ]
                    notebook.cells.append(
                        nbf_version.new_markdown_cell(
                            source=source, metadata=md_metadata,
                        )
                    )
                if token.content:
                    md_metadata = {}
                    try:
                        md_metadata = json.loads(token.content.strip())
                    except Exception as err:
                        if not ignore_bad_meta:
                            raise MystMetadataParsingError(
                                "markdown cell {0} at {1} could not be read: {2}".format(
                                    len(notebook.cells) + 1, token.position, err
                                )
                            ) from err
                    if not isinstance(md_metadata, dict):
                        if not ignore_bad_meta:
                            raise MystMetadataParsingError(
                                "markdown cell {0} at {1} is not a dict".format(
                                    len(notebook.cells) + 1, token.position
                                )
                            )
                        # valid JSON, but not a mapping (e.g. a list or number):
                        # when ignoring bad metadata, drop it instead of
                        # attaching it to the next markdown cell
                        md_metadata = {}
                else:
                    md_metadata = {}
                current_line = token.position.line_start
            if isinstance(item.node, CodeFence) and item.node.language in [
                code_directive,
                raw_directive,
            ]:
                token = item.node  # type: CodeFence
                # Note: we ignore anything after the directive on the first line
                # this is reserved for the optional lexer name
                # TODO: could log warning about if token.arguments != lexer name

                options, body_lines = {}, []
                try:
                    _, options, body_lines = parse_directive_text(
                        directive_class=MockDirective,
                        argument_str="",
                        content=token.children[0].content,
                        validate_options=False,
                    )
                except DirectiveParsingError as err:
                    if not ignore_bad_meta:
                        raise MystMetadataParsingError(
                            "Code cell {0} at {1} could not be read: {2}".format(
                                len(notebook.cells) + 1, token.position, err
                            )
                        ) from err

                # flush the text before the fence as a markdown cell
                md_source = _fmt_md(
                    "".join(lines.lines[current_line:token.position.line_start - 1])
                )
                if md_source:
                    md_metadata = nbf.from_dict(md_metadata)
                    if store_line_numbers:
                        md_metadata["_source_lines"] = [
                            current_line,
                            token.position.line_start - 1,
                        ]
                    notebook.cells.append(
                        nbf_version.new_markdown_cell(
                            source=md_source, metadata=md_metadata,
                        )
                    )
                current_line = token.position.line_end
                md_metadata = {}

                cell_metadata = nbf.from_dict(options)
                if store_line_numbers:
                    cell_metadata["_source_lines"] = [
                        token.position.line_start,
                        token.position.line_end,
                    ]
                if item.node.language == code_directive:
                    notebook.cells.append(
                        nbf_version.new_code_cell(
                            source="\n".join(body_lines), metadata=cell_metadata,
                        )
                    )
                if item.node.language == raw_directive:
                    notebook.cells.append(
                        nbf_version.new_raw_cell(
                            source="\n".join(body_lines), metadata=cell_metadata,
                        )
                    )

        # add the final markdown cell (if present)
        if lines.lines[current_line:]:
            md_metadata = nbf.from_dict(md_metadata)
            if store_line_numbers:
                md_metadata["_source_lines"] = [current_line, len(lines.lines)]
            notebook.cells.append(
                nbf_version.new_markdown_cell(
                    source=_fmt_md("".join(lines.lines[current_line:])),
                    metadata=md_metadata,
                )
            )

    finally:
        set_parse_context(original_context)

    return notebook
Example #14
0
    def parse(self, inputstring, document):
        """Parse a serialized notebook and render it into ``document``.

        Markdown cells are tokenized and code cells are wrapped in
        ``NbCodeCell`` tokens; all tokens are gathered into one ``Document``
        token, which is rendered with ``SphinxNBRenderer``. Along the way the
        notebook outputs are written to disk and the notebook is registered
        with the glue domain.

        :param inputstring: the notebook text, as accepted by ``nbf.reads``
        :param document: the docutils document to render into; its
            ``"source"``, ``reporter`` and ``settings.env`` are read here

        NOTE(review): the global parse context set below is not restored
        before this method returns.
        """

        # de-serialize the notebook
        ntbk = nbf.reads(inputstring, nbf.NO_CONVERT)

        # This is a container for top level markdown tokens
        # which we will add to as we walk the document
        mkdown_tokens = []  # type: list[BlockToken]

        # First we ensure that we are using a 'clean' global context
        # for parsing, which is setup with the MyST parsing tokens
        # the logger will report on duplicate link/footnote definitions, etc
        parse_context = ParseContext(
            find_blocks=SphinxNBRenderer.default_block_tokens,
            find_spans=SphinxNBRenderer.default_span_tokens,
            logger=SPHINX_LOGGER,
        )
        set_parse_context(parse_context)

        for cell_index, nb_cell in enumerate(ntbk.cells):

            # Skip empty cells
            if len(nb_cell["source"].strip()) == 0:
                continue

            # skip cells tagged for removal
            tags = nb_cell.metadata.get("tags", [])
            if "remove_cell" in tags:
                continue

            if nb_cell["cell_type"] == "markdown":

                # we add the document path and cell index
                # to the source lines, so they can be included in the error logging
                # NOTE: currently the logic to report metadata is not written
                # into SphinxRenderer, but this will be introduced in a later update
                lines = SourceLines(
                    nb_cell["source"],
                    uri=document["source"],
                    metadata={"cell_index": cell_index},
                    standardize_ends=True,
                )

                # parse the source markdown text;
                # at this point span/inline level tokens are not yet processed, but
                # link/footnote definitions are collected/stored in the global context
                mkdown_tokens.extend(tokenize_block(lines))

                # TODO for md cells, think of a way to implement the previous
                # `if "hide_input" in tags:` logic

            elif nb_cell["cell_type"] == "code":
                # here we do nothing but store the cell as a custom token
                mkdown_tokens.append(
                    NbCodeCell(
                        cell=nb_cell,
                        position=Position(
                            line_start=0,
                            uri=document["source"],
                            data={"cell_index": cell_index},
                        ),
                    ))
            # cells of any other type are not added to the token list

        # Now all definitions have been gathered, we walk the tokens and
        # process any inline text
        for token in mkdown_tokens + list(
                get_parse_context().foot_definitions.values()):
            token.expand_spans()

        # If there are widgets, this will embed the state of all widgets in a script
        if contains_widgets(ntbk):
            mkdown_tokens.insert(0,
                                 JupyterWidgetState(state=get_widgets(ntbk)))

        # create the front matter token
        # (its content is the notebook metadata object itself, not source text)
        front_matter = FrontMatter(content=ntbk.metadata, position=None)

        # Finally, we create the top-level markdown document
        markdown_doc = Document(
            children=mkdown_tokens,
            front_matter=front_matter,
            link_definitions=parse_context.link_definitions,
            footnotes=parse_context.foot_definitions,
            footref_order=parse_context.foot_references,
        )

        self.reporter = document.reporter
        self.config = self.default_config.copy()
        try:
            new_cfg = document.settings.env.config.myst_config
            self.config.update(new_cfg)
        except AttributeError:
            # no ``myst_config`` reachable from the document: keep the defaults
            pass

        # Remove all the mime prefixes from "glue" step.
        # This way, writing properly captures the glued images
        # (the modified outputs are remembered so the prefix can be re-added)
        replace_mime = []
        for cell in ntbk.cells:
            if hasattr(cell, "outputs"):
                for out in cell.outputs:
                    if "data" in out:
                        # Only do the mimebundle replacing for the scrapbook outputs
                        mime_prefix = (out.get("metadata",
                                               {}).get("scrapbook",
                                                       {}).get("mime_prefix"))
                        if mime_prefix:
                            out["data"] = {
                                key.replace(mime_prefix, ""): val
                                for key, val in out["data"].items()
                            }
                            replace_mime.append(out)

        # Write the notebook's output to disk. This changes metadata in notebook cells
        # The target is ``<parent of outdir>/jupyter_execute/<docname parent>``
        path_doc = Path(document.settings.env.docname)
        doc_relpath = path_doc.parent
        doc_filename = path_doc.name
        build_dir = Path(document.settings.env.app.outdir).parent
        output_dir = build_dir.joinpath("jupyter_execute", doc_relpath)
        write_notebook_output(ntbk, str(output_dir), doc_filename)

        # Now add back the mime prefixes to the right outputs so they aren't rendered
        # until called from the role/directive
        # NOTE(review): the prefix re-added is ``GLUE_PREFIX``, not the
        # ``mime_prefix`` that was removed above -- presumably they are equal; confirm
        for out in replace_mime:
            out["data"] = {
                f"{GLUE_PREFIX}{key}": val
                for key, val in out["data"].items()
            }

        # Update our glue key list with new ones defined in this page
        glue_domain = NbGlueDomain.from_env(document.settings.env)
        glue_domain.add_notebook(ntbk, path_doc)

        # render the Markdown AST to docutils AST
        renderer = SphinxNBRenderer(parse_context=parse_context,
                                    document=document,
                                    current_node=None)
        renderer.render(markdown_doc)