Example #1
def tokenize_main(
        lines: SourceLines,
        token_types=None,
        expand_spans: bool = True,
        skip_tokens: list = ("LinkDefinition", "Footnote"),
):
    """Searches for token_types in an iterable.

    :param lines: the source lines
    :param token_types: override block-level tokens set in global context
    :param expand_spans: After the initial parse the span text is not yet tokenized,
        but stored instead as raw text in `SpanContainer`, in order to ensure
        all link definitions are read first. Setting this to True runs a second walk
        of the syntax tree to replace these `SpanContainer` with the final span tokens.
    :param skip_tokens: do not store these ``token.name`` in the syntax tree.
        These are usually tokens that store themselves in the global context

    :returns: list of block-level token instances.
    """
    if not isinstance(lines, SourceLines):
        lines = SourceLines(lines)
    if token_types is None:
        token_types = get_parse_context().block_tokens
    tokens = tokenize_block(lines,
                            token_types=token_types,
                            skip_tokens=skip_tokens)
    if expand_spans:
        for token in tokens + list(
                get_parse_context().foot_definitions.values()):
            token.expand_spans()
    return tokens
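A minimal usage sketch (the markdown lines are illustrative, and it assumes the default block tokens registered in the global parse context):

tokens = tokenize_main(["# A heading\n", "\n", "Some *emphasised* text\n"])
for token in tokens:
    print(token.name)  # e.g. "Heading", "Paragraph"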
def test_repeated_footnote(caplog):
    get_parse_context().block_tokens.insert_before(
        block_tokens_ext.Footnote, block_tokens.LinkDefinition
    )
    tokenize_main(["[^1]: value1\n", "[^1]: value2\n"])
    assert "ignoring duplicate footnote definition" in caplog.text
    assert len(get_parse_context().foot_definitions) == 1
    def read(cls, lines: SourceLines):
        start_line = lines.lineno + 1

        next(lines)  # skip first ``---``
        line_buffer = []
        next_line = lines.peek()
        while not (next_line is None or next_line.startswith("---")):
            line_buffer.append(next(lines))
            next_line = lines.peek()

        if next_line is not None:
            next(lines)  # move past closing ``---``
            log_warning = False
        else:
            log_warning = True

        position = Position.from_source_lines(lines, start_line=start_line)
        if log_warning:
            get_parse_context().logger.warning(
                "{} No closing `---` was found for initial metadata block".format(
                    position.make_loc_str()
                )
            )

        return cls(content="".join(line_buffer), position=position)
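A hedged usage sketch, assuming this is the classmethod behind the FrontMatter.read calls seen in later examples; the opening ``---`` must be the current line, and a missing closing ``---`` only logs a warning:

lines = SourceLines(["---\n", "title: My page\n", "---\n", "content\n"])
front_matter = FrontMatter.read(lines)
print(front_matter.content)  # "title: My page\n"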
def test_foot_ref_span(name, source, data_regression):
    get_parse_context().foot_definitions["a"] = True
    _span_tokens = get_parse_context().span_tokens
    _span_tokens.insert_after(FootReference, CoreTokens)
    data_regression.check(
        serialize_tokens(tokenize_span(source), as_dict=True),
        basename=f"test_foot_ref_span_{name}",
    )
def test_resolution(name, source, data_regression):
    get_parse_context().span_tokens.insert_after(FootReference, CoreTokens)
    get_parse_context().block_tokens.insert_before(
        block_tokens_ext.Footnote, block_tokens.LinkDefinition
    )
    data_regression.check(
        serialize_tokens(block_tokens.Document.read(source), as_dict=True),
        basename=f"test_resolution_{name}",
    )
def test_foot_definition(name, source, data_regression):
    get_parse_context().block_tokens.insert_before(
        block_tokens_ext.Footnote, block_tokens.LinkDefinition
    )
    tree = serialize_tokens(tokenize_main(source), as_dict=True)
    footnotes = serialize_tokens(get_parse_context().foot_definitions, as_dict=True)
    data_regression.check(
        {
            "tree": tree,
            "footnotes": footnotes,
            "link_definitions": get_parse_context().link_definitions,
        },
        basename=f"test_foot_definitions_{name}",
    )
 def append_link_definitions(matches, position):
     for key, dest, title in matches:
         key = normalize_label(key)
         dest = span_tokens.EscapeSequence.strip(dest.strip())
         title = span_tokens.EscapeSequence.strip(title)
         link_definitions = get_parse_context().link_definitions
         if key not in link_definitions:
             link_definitions[key] = dest, title
         else:
             get_parse_context().logger.warning(
                 "{} ignoring duplicate link definition '{}'".format(
                     position.make_loc_str(), key
                 )
             )
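A hedged sketch of feeding a pre-parsed (label, destination, title) triple into the global link definitions; the Position keyword arguments mirror the form used in a later example:

pos = Position(line_start=1, uri="<string>", data={})
append_link_definitions([("My Label", "https://example.com", "A title")], pos)
append_link_definitions([("My Label", "https://other.org", "")], pos)  # logged and ignored as a duplicate
print(get_parse_context().link_definitions)  # keys are normalized labels, e.g. "my label"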
Example #8
    def render_document(self, token: block_tokens.Document):
        if token.front_matter:
            self.render_front_matter(token.front_matter)
        self.render_children(token)

        if getattr(token, "is_nested", False):
            # if the document is nested in another, we don't want to output footnotes
            return self.document

        # we use the footnotes stored in the global context,
        # rather than those stored on the document,
        # since additional references may have been made in nested parses
        footnotes = get_parse_context().foot_definitions

        # we don't use the foot_references stored on the global context,
        # since references within directives/roles will have been added after
        # those from the initial markdown parse
        # instead we gather them from a walk of the created document
        # foot_refs = get_parse_context().foot_references
        foot_refs = OrderedDict()
        for refnode in self.document.traverse(nodes.footnote_reference):
            if refnode["refname"] not in foot_refs:
                foot_refs[refnode["refname"]] = True

        if foot_refs:
            self.current_node.append(nodes.transition())
        for footref in foot_refs:
            if footref in footnotes:
                self.render_footnote(footnotes[footref])

        return self.document
Example #9
 def inline_text(self, text: str, lineno: int):
     # TODO return messages?
     messages = []
     paragraph = nodes.paragraph("")
     # here we instantiate a new renderer,
     # so that the nested parse does not affect the current renderer,
     # but we use the same global parse context, so that link references, etc
     # are added to the global parse.
     renderer = self._renderer.__class__(
         document=self.document,
         current_node=paragraph,
         parse_context=get_parse_context(),
     )
     lines = SourceLines(
         text,
         start_line=self._lineno,
         uri=self.document["source"],
         metadata=self._token.position.data,
         standardize_ends=True,
     )
     doc_token = myst_block_tokens.Document.read(
         lines, front_matter=False, reset_definitions=False
     )
     # we mark the token as nested so that footnotes etc aren't rendered
     doc_token.is_nested = True
     renderer.render(doc_token)
     textnodes = []
     if paragraph.children:
         # first child should be paragraph
         textnodes = paragraph.children[0].children
     return textnodes, messages
Example #10
def match_link_label(string, offset):
    start = -1
    end = -1
    escaped = False
    for i, c in enumerate(string[offset:], start=offset):
        if c == "\\" and not escaped:
            escaped = True
        elif c == "[" and not escaped:
            if start == -1:
                start = i
            else:
                return None
        elif c == "]" and not escaped:
            end = i
            label = string[start + 1:end]
            match_info = start, end + 1, label
            if label.strip() != "":
                link_definitions = get_parse_context().link_definitions
                ref = link_definitions.get(normalize_label(label), None)
                if ref is not None:
                    return match_info, ref
                return None
            return None
        elif escaped:
            escaped = False
    return None
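A hedged usage sketch: the matcher only succeeds when the bracketed label has already been stored as a link definition in the global context:

get_parse_context().link_definitions["spec"] = ("https://spec.commonmark.org", "")
result = match_link_label("see [spec] for details", 4)
if result is not None:
    (start, end, label), (dest, title) = result
    print(label, dest)  # spec https://spec.commonmark.org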
Example #11
def test_link_definitions(name, source, data_regression):
    tree = serialize_tokens(tokenize_main(source), as_dict=True)
    data_regression.check(
        {
            "tree": tree,
            "link_definitions": get_parse_context().link_definitions
        },
        basename=f"test_link_definitions_{name}",
    )
    def read(
        cls,
        lines: Union[str, ListType[str], SourceLines],
        reset_definitions: bool = True,
        skip_tokens: list = ("LinkDefinition", "Footnote"),
        front_matter: bool = False,
    ):
        """Read a document

        :param lines: Lines to parse
        :param reset_definitions: remove any previously stored definitions
            in the global context (see ``ParseContext.reset_definitions()``).
        :param skip_tokens: do not store these ``token.name`` in the syntax tree.
            These are usually tokens that store themselves in the global context.
        :param front_matter: search for an initial YAML front matter block
            (note this is not strictly CommonMark compliant)
        """
        if reset_definitions:
            get_parse_context().reset_definitions()

        if not isinstance(lines, SourceLines):
            lines = SourceLines(lines, standardize_ends=True)

        # TODO can we do this in a way where we are checking
        # FrontMatter in get_parse_context().block_tokens?
        # then it would be easier to add/remove it in the renderers
        front_matter_token = None
        if front_matter and lines.peek() and lines.peek().startswith("---"):
            front_matter_token = FrontMatter.read(lines)

        children = tokenizer.tokenize_main(lines=lines, skip_tokens=skip_tokens)
        foot_defs = get_parse_context().foot_definitions
        return cls(
            children=children,
            front_matter=front_matter_token,
            link_definitions=get_parse_context().link_definitions,
            footnotes=foot_defs,
            footref_order=[
                t for t in get_parse_context().foot_references if t in foot_defs
            ],
        )
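A minimal hedged sketch of reading a whole document, including a front matter block (block_tokens.Document is the same class used in the tests above):

doc = block_tokens.Document.read(
    "---\ntitle: Demo\n---\n\nSome text with a [link][a].\n\n[a]: https://example.com\n",
    front_matter=True,
)
print(doc.front_matter.content)  # "title: Demo\n"
print(doc.link_definitions)      # e.g. {"a": ("https://example.com", "")}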
Example #13
def is_link_label(text):
    escaped = False
    for c in text:
        if c == "\\" and not escaped:
            escaped = True
        elif (c == "[" or c == "]") and not escaped:
            return None
        elif escaped:
            escaped = False
    if text.strip() != "":
        link_definitions = get_parse_context().link_definitions
        return link_definitions.get(normalize_label(text), None)
    return None
def tokenize_span(string, token_types=None):
    """Convert a string to a list of span tokens.

    :param string: the string to parse
    :param token_types: override span-level tokens set in the global context

    :returns: list of span-level token instances.
    """
    if token_types is None:
        token_types = get_parse_context().span_tokens
    *token_types, fallback_token = token_types
    tokens = find_tokens(string, token_types, fallback_token)
    token_buffer = []
    if tokens:
        prev = tokens[0]
        for curr in tokens[1:]:
            prev = eval_tokens(prev, curr, token_buffer)
        token_buffer.append(prev)
    return make_tokens(token_buffer, 0, len(string), string, fallback_token)
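A hedged sketch: the span tokens come from the global context, so a plain string can be tokenized directly once that context is set up:

spans = tokenize_span("some *emphasised* text")
print([type(span).__name__ for span in spans])  # e.g. ['RawText', 'Emphasis', 'RawText']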
Example #15
def tokenize_block(lines: SourceLines,
                   token_types=None,
                   skip_tokens=("LinkDefinition", "Footnote")):
    """Returns a list of parsed tokens."""
    assert isinstance(lines,
                      SourceLines), "lines must be `SourceLines` instance"
    if token_types is None:
        token_types = get_parse_context().block_tokens
    parsed_tokens = ParseBuffer()
    line = lines.peek()
    while line is not None:
        for token_type in token_types:
            if token_type.start(line):
                token = token_type.read(lines)
                if token is not None:
                    if token.name not in skip_tokens:
                        parsed_tokens.append(token)
                    break
        else:  # unmatched newlines
            next(lines)
            parsed_tokens.loose = True
        line = lines.peek()
    return parsed_tokens
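A minimal sketch: unlike tokenize_main above, the caller must wrap the input in SourceLines first, and span text is left unexpanded:

lines = SourceLines(["# Heading\n", "\n", "paragraph text\n"])
parsed = tokenize_block(lines)
print([token.name for token in parsed])  # e.g. ['Heading', 'Paragraph']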
Example #16
def find_nested_tokenizer(string):
    get_parse_context().nesting_matches = {}  # reset nesting matches
    if not string:
        return []
    delimiters = []
    matches = []
    escaped = False  # escaped denotes that the last cursor position had `\`
    in_delimiter_run = None  # delimiter runs are sequences of `*` or `_`
    in_image = False
    start = 0
    i = 0

    has_math = Math in get_parse_context().span_tokens
    has_strikethrough = Strikethrough in get_parse_context().span_tokens
    has_footrefs = FootReference in get_parse_context().span_tokens
    code_match, strike_match, math_match = advance_searches(
        string, 0, has_strikethrough, has_math)

    while i < len(string):

        if strike_match is not None and i == strike_match.start():
            get_parse_context().nesting_matches.setdefault(
                "Strikethrough", []).append(strike_match)
            strike_match = Strikethrough.pattern.search(string, i + 1)
            continue

        if code_match is not None and i == code_match.start():

            if in_delimiter_run:
                delimiters.append(Delimiter(start, i, string))
            in_delimiter_run = None

            get_parse_context().nesting_matches.setdefault(
                "InlineCode", []).append(code_match)
            i = code_match.end()
            code_match, strike_match, math_match = advance_searches(
                string, i, has_strikethrough, has_math)
            continue

        if math_match is not None and i == math_match.start():

            if in_delimiter_run:
                delimiters.append(Delimiter(start, i, string))
            in_delimiter_run = None

            get_parse_context().nesting_matches.setdefault(
                "Math", []).append(math_match)
            i = math_match.end()
            code_match, strike_match, math_match = advance_searches(
                string, i, has_strikethrough, has_math)
            continue

        c = string[i]
        if c == "\\" and not escaped:
            escaped = True
            i += 1
            continue
        if in_delimiter_run is not None and (c != in_delimiter_run or escaped):
            delimiters.append(
                Delimiter(start, i if not escaped else i - 1, string))
            in_delimiter_run = None
        if in_delimiter_run is None and (c == "*" or c == "_") and not escaped:
            in_delimiter_run = c
            start = i
        if not escaped:
            if c == "[":
                foot_ref_match = match_foot_ref(string,
                                                i) if has_footrefs else None
                if foot_ref_match:
                    get_parse_context().nesting_matches.setdefault(
                        "FootReference", []).append(foot_ref_match)
                    i = foot_ref_match.end()
                    in_image = False
                    continue
                if not in_image:
                    delimiters.append(Delimiter(i, i + 1, string))
                else:
                    delimiters.append(Delimiter(i - 1, i + 1, string))
                    in_image = False
            elif c == "!":
                in_image = True
            elif c == "]":
                i = find_link_image(string, i, delimiters, matches)
                code_match, strike_match, math_match = advance_searches(
                    string, i, has_strikethrough, has_math)
            elif in_image:
                in_image = False
        else:
            escaped = False
        i += 1
    if in_delimiter_run:
        delimiters.append(Delimiter(start, i, string))
    process_emphasis(string, None, delimiters, matches)
    return matches
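A hedged sketch of calling the nested tokenizer directly; it returns the raw link/image matches and stashes code, math, strikethrough and footnote-reference matches on the global context:

matches = find_nested_tokenizer("some *emphasis*, a [link](https://example.com) and `code`")
print(matches)                              # low-level match objects consumed by the span tokens
print(get_parse_context().nesting_matches)  # matches keyed by token name, e.g. "InlineCode"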
Example #17
def test_repeated_link_defs(caplog):
    tokenize_main(["[a]: value1\n", "[a]: value2\n"])
    assert "ignoring duplicate link definition" in caplog.text
    assert len(get_parse_context().link_definitions) == 1
Example #18
 def read(cls, match: Pattern):
     target = match.group(1)
     # add the targets to an ordered set, so we record the order of reference
     get_parse_context().foot_references.add(target)
     return cls(target=target)
Example #19
 def find(cls, string):
     matches = get_parse_context().nesting_matches.pop("Math", [])
     return matches
Example #20
    def parse(self, inputstring, document):

        # de-serialize the notebook
        ntbk = nbf.reads(inputstring, nbf.NO_CONVERT)

        # This is a container for top level markdown tokens
        # which we will add to as we walk the document
        mkdown_tokens = []  # type: list[BlockToken]

        # First we ensure that we are using a 'clean' global context
        # for parsing, which is set up with the MyST parsing tokens
        # the logger will report on duplicate link/footnote definitions, etc
        parse_context = ParseContext(
            find_blocks=SphinxNBRenderer.default_block_tokens,
            find_spans=SphinxNBRenderer.default_span_tokens,
            logger=SPHINX_LOGGER,
        )
        set_parse_context(parse_context)

        for cell_index, nb_cell in enumerate(ntbk.cells):

            # Skip empty cells
            if len(nb_cell["source"].strip()) == 0:
                continue

            # skip cells tagged for removal
            tags = nb_cell.metadata.get("tags", [])
            if "remove_cell" in tags:
                continue

            if nb_cell["cell_type"] == "markdown":

                # we add the document path and cell index
                # to the source lines, so they can be included in the error logging
                # NOTE: currently the logic to report metadata is not written
                # into SphinxRenderer, but this will be introduced in a later update
                lines = SourceLines(
                    nb_cell["source"],
                    uri=document["source"],
                    metadata={"cell_index": cell_index},
                    standardize_ends=True,
                )

                # parse the source markdown text;
                # at this point span/inline level tokens are not yet processed, but
                # link/footnote definitions are collected/stored in the global context
                mkdown_tokens.extend(tokenize_block(lines))

                # TODO for md cells, think of a way to implement the previous
                # `if "hide_input" in tags:` logic

            elif nb_cell["cell_type"] == "code":
                # here we do nothing but store the cell as a custom token
                mkdown_tokens.append(
                    NbCodeCell(
                        cell=nb_cell,
                        position=Position(
                            line_start=0,
                            uri=document["source"],
                            data={"cell_index": cell_index},
                        ),
                    ))

        # Now all definitions have been gathered, we walk the tokens and
        # process any inline text
        for token in mkdown_tokens + list(
                get_parse_context().foot_definitions.values()):
            token.expand_spans()

        # If there are widgets, this will embed the state of all widgets in a script
        if contains_widgets(ntbk):
            mkdown_tokens.insert(0,
                                 JupyterWidgetState(state=get_widgets(ntbk)))

        # create the front matter token
        front_matter = FrontMatter(content=ntbk.metadata, position=None)

        # Finally, we create the top-level markdown document
        markdown_doc = Document(
            children=mkdown_tokens,
            front_matter=front_matter,
            link_definitions=parse_context.link_definitions,
            footnotes=parse_context.foot_definitions,
            footref_order=parse_context.foot_references,
        )

        self.reporter = document.reporter
        self.config = self.default_config.copy()
        try:
            new_cfg = document.settings.env.config.myst_config
            self.config.update(new_cfg)
        except AttributeError:
            pass

        # Remove all the mime prefixes from the "glue" step.
        # This way, writing properly captures the glued images
        replace_mime = []
        for cell in ntbk.cells:
            if hasattr(cell, "outputs"):
                for out in cell.outputs:
                    if "data" in out:
                        # Only do the mimebundle replacing for the scrapbook outputs
                        mime_prefix = (out.get("metadata",
                                               {}).get("scrapbook",
                                                       {}).get("mime_prefix"))
                        if mime_prefix:
                            out["data"] = {
                                key.replace(mime_prefix, ""): val
                                for key, val in out["data"].items()
                            }
                            replace_mime.append(out)

        # Write the notebook's output to disk. This changes metadata in notebook cells
        path_doc = Path(document.settings.env.docname)
        doc_relpath = path_doc.parent
        doc_filename = path_doc.name
        build_dir = Path(document.settings.env.app.outdir).parent
        output_dir = build_dir.joinpath("jupyter_execute", doc_relpath)
        write_notebook_output(ntbk, str(output_dir), doc_filename)

        # Now add back the mime prefixes to the right outputs so they aren't rendered
        # until called from the role/directive
        for out in replace_mime:
            out["data"] = {
                f"{GLUE_PREFIX}{key}": val
                for key, val in out["data"].items()
            }

        # Update our glue key list with new ones defined in this page
        glue_domain = NbGlueDomain.from_env(document.settings.env)
        glue_domain.add_notebook(ntbk, path_doc)

        # render the Markdown AST to docutils AST
        renderer = SphinxNBRenderer(parse_context=parse_context,
                                    document=document,
                                    current_node=None)
        renderer.render(markdown_doc)
Example #21
 def find(cls, string):
     matches = get_parse_context().nesting_matches.pop("FootReference", [])
     return matches
Example #22
def myst_to_notebook(
    text,
    code_directive=CODE_DIRECTIVE,
    raw_directive=RAW_DIRECTIVE,
    ignore_bad_meta=False,
    store_line_numbers=False,
):
    """Convert text written in the myst format to a notebook.

    :param text: the file text
    :param code_directive: the name of the directive to search for containing code cells
    :param raw_directive: the name of the directive to search for containing raw cells
    :param ignore_bad_meta: ignore metadata that cannot be parsed as JSON/YAML
    :param store_line_numbers: add a `_source_lines` key to cell metadata,
        mapping to the cell's line range in the source text.

    NOTE: we assume here that all of these directives are at the top-level,
    i.e. not nested in other directives.
    """
    from mistletoe.base_elements import SourceLines
    from mistletoe.parse_context import (
        ParseContext,
        get_parse_context,
        set_parse_context,
    )
    from mistletoe.block_tokens import Document, CodeFence

    from myst_parser.block_tokens import BlockBreak
    from myst_parser.parse_directives import DirectiveParsingError, parse_directive_text
    from myst_parser.docutils_renderer import DocutilsRenderer

    code_directive = "{{{0}}}".format(code_directive)
    raw_directive = "{{{0}}}".format(raw_directive)

    original_context = get_parse_context()
    parse_context = ParseContext(
        find_blocks=DocutilsRenderer.default_block_tokens,
        find_spans=DocutilsRenderer.default_span_tokens,
    )

    if isinstance(text, SourceLines):
        lines = text
    else:
        lines = SourceLines(text, standardize_ends=True)

    try:
        set_parse_context(parse_context)
        doc = Document.read(lines, front_matter=True)
        metadata_nb = {}
        try:
            metadata_nb = doc.front_matter.get_data() if doc.front_matter else {}
        except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
            if not ignore_bad_meta:
                raise MystMetadataParsingError("Notebook metadata: {}".format(error))
        nbf_version = nbf.v4
        kwargs = {"metadata": nbf.from_dict(metadata_nb)}
        notebook = nbf_version.new_notebook(**kwargs)

        current_line = 0 if not doc.front_matter else doc.front_matter.position.line_end
        md_metadata = {}

        for item in doc.walk(["CodeFence", "BlockBreak"]):
            if isinstance(item.node, BlockBreak):
                token = item.node  # type: BlockBreak
                source = _fmt_md(
                    "".join(lines.lines[current_line:token.position.line_start - 1])
                )
                if source:
                    md_metadata = nbf.from_dict(md_metadata)
                    if store_line_numbers:
                        md_metadata["_source_lines"] = [
                            current_line,
                            token.position.line_start - 1,
                        ]
                    notebook.cells.append(
                        nbf_version.new_markdown_cell(
                            source=source, metadata=md_metadata,
                        )
                    )
                if token.content:
                    md_metadata = {}
                    try:
                        md_metadata = json.loads(token.content.strip())
                    except Exception as err:
                        if not ignore_bad_meta:
                            raise MystMetadataParsingError(
                                "markdown cell {0} at {1} could not be read: {2}".format(
                                    len(notebook.cells) + 1, token.position, err
                                )
                            )
                    if not isinstance(md_metadata, dict):
                        if not ignore_bad_meta:
                            raise MystMetadataParsingError(
                                "markdown cell {0} at {1} is not a dict".format(
                                    len(notebook.cells) + 1, token.position
                                )
                            )
                else:
                    md_metadata = {}
                current_line = token.position.line_start
            if isinstance(item.node, CodeFence) and item.node.language in [
                code_directive,
                raw_directive,
            ]:
                token = item.node  # type: CodeFence
                # Note: we ignore anything after the directive on the first line
                # this is reserved for the optional lexer name
                # TODO: could log a warning if token.arguments != lexer name

                options, body_lines = {}, []
                try:
                    _, options, body_lines = parse_directive_text(
                        directive_class=MockDirective,
                        argument_str="",
                        content=token.children[0].content,
                        validate_options=False,
                    )
                except DirectiveParsingError as err:
                    if not ignore_bad_meta:
                        raise MystMetadataParsingError(
                            "Code cell {0} at {1} could not be read: {2}".format(
                                len(notebook.cells) + 1, token.position, err
                            )
                        )

                md_source = _fmt_md(
                    "".join(lines.lines[current_line:token.position.line_start - 1])
                )
                if md_source:
                    md_metadata = nbf.from_dict(md_metadata)
                    if store_line_numbers:
                        md_metadata["_source_lines"] = [
                            current_line,
                            token.position.line_start - 1,
                        ]
                    notebook.cells.append(
                        nbf_version.new_markdown_cell(
                            source=md_source, metadata=md_metadata,
                        )
                    )
                current_line = token.position.line_end
                md_metadata = {}

                cell_metadata = nbf.from_dict(options)
                if store_line_numbers:
                    cell_metadata["_source_lines"] = [
                        token.position.line_start,
                        token.position.line_end,
                    ]
                if item.node.language == code_directive:
                    notebook.cells.append(
                        nbf_version.new_code_cell(
                            source="\n".join(body_lines), metadata=cell_metadata,
                        )
                    )
                if item.node.language == raw_directive:
                    notebook.cells.append(
                        nbf_version.new_raw_cell(
                            source="\n".join(body_lines), metadata=cell_metadata,
                        )
                    )

        # add the final markdown cell (if present)
        if lines.lines[current_line:]:
            md_metadata = nbf.from_dict(md_metadata)
            if store_line_numbers:
                md_metadata["_source_lines"] = [current_line, len(lines.lines)]
            notebook.cells.append(
                nbf_version.new_markdown_cell(
                    source=_fmt_md("".join(lines.lines[current_line:])),
                    metadata=md_metadata,
                )
            )

    finally:
        set_parse_context(original_context)

    return notebook
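A hedged usage sketch, assuming the default code-cell directive name is "code-cell"; the text is plain MyST markdown containing one code cell:

text = "\n".join([
    "# A title",
    "",
    "```{code-cell}",
    "print('hello')",
    "```",
    "",
])
nb = myst_to_notebook(text)
print(len(nb.cells))              # expected: a markdown cell plus a code cell
print(nb.cells[-1]["cell_type"])  # "code"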
def reset_parse_context():
    """Ensure the parse context is reset before each test."""
    from mistletoe.parse_context import get_parse_context

    get_parse_context(reset=True)
Example #24
def match_foot_ref(string, offset):
    match = FootReference.pattern.match(string[offset:])
    if not match:
        return
    if match.group(1) in get_parse_context().foot_definitions:
        return MatchObj(offset, match.end() + offset, (-1, -1, match.group(1)))
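A short hedged sketch: a footnote reference only matches once its label is registered as a definition in the global context:

get_parse_context().foot_definitions["note"] = True  # stand-in for a parsed definition
print(match_foot_ref("see [^note] here", 4))   # a MatchObj spanning "[^note]"
print(match_foot_ref("see [^other] here", 4))  # None, no matching definition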