def read(cls, lines: SourceLines):
    start_line = lines.lineno + 1
    next(lines)  # skip first ``---``
    line_buffer = []
    next_line = lines.peek()
    while not (next_line is None or next_line.startswith("---")):
        line_buffer.append(next(lines))
        next_line = lines.peek()
    if next_line is not None:
        next(lines)  # move past closing ``---``
        log_warning = False
    else:
        log_warning = True
    position = Position.from_source_lines(lines, start_line=start_line)
    if log_warning:
        get_parse_context().logger.warning(
            "{} No closing `---` was found for initial metadata block".format(
                position.make_loc_str()
            )
        )
    return cls(content="".join(line_buffer), position=position)
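A hedged sketch of how this reader is driven in practice (``Document.read`` below calls it when the first source line starts with ``---``); the import path and the example text are assumptions, not taken from the source:

from mistletoe.base_elements import SourceLines
from mistletoe.block_tokens import FrontMatter  # import path assumed

lines = SourceLines("---\ntitle: My page\n---\ncontent\n", standardize_ends=True)
front_matter = FrontMatter.read(lines)  # consumes everything up to the closing ---
print(front_matter.get_data())          # {'title': 'My page'}
print(lines.peek())                     # 'content\n' - the remaining source is untouched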
def inline_text(self, text: str, lineno: int):
    # TODO return messages?
    messages = []
    paragraph = nodes.paragraph("")

    # here we instantiate a new renderer,
    # so that the nested parse does not affect the current renderer,
    # but we use the same global parse context, so that link references, etc
    # are added to the global parse.
    renderer = self._renderer.__class__(
        document=self.document,
        current_node=paragraph,
        parse_context=get_parse_context(),
    )
    lines = SourceLines(
        text,
        start_line=self._lineno,
        uri=self.document["source"],
        metadata=self._token.position.data,
        standardize_ends=True,
    )
    doc_token = myst_block_tokens.Document.read(
        lines, front_matter=False, reset_definitions=False
    )
    # we mark the token as nested so that footnotes etc aren't rendered
    doc_token.is_nested = True
    renderer.render(doc_token)

    textnodes = []
    if paragraph.children:
        # first child should be paragraph
        textnodes = paragraph.children[0].children
    return textnodes, messages
def tokenize_main(
    lines: SourceLines,
    token_types=None,
    expand_spans: bool = True,
    skip_tokens: list = ("LinkDefinition", "Footnote"),
):
    """Searches for token_types in an iterable.

    :param lines: the source lines
    :param token_types: override block-level tokens set in the global context
    :param expand_spans: After the initial parse the span text is not yet tokenized,
        but stored instead as raw text in `SpanContainer`, in order to ensure all link
        definitions are read first. Setting this to True runs a second walk of the
        syntax tree to replace these `SpanContainer` with the final span tokens.
    :param skip_tokens: do not store these ``token.name`` in the syntax tree.
        These are usually tokens that store themselves in the global context.

    :returns: list of block-level token instances.
    """
    if not isinstance(lines, SourceLines):
        lines = SourceLines(lines)
    if token_types is None:
        token_types = get_parse_context().block_tokens
    tokens = tokenize_block(lines, token_types=token_types, skip_tokens=skip_tokens)
    if expand_spans:
        for token in tokens + list(get_parse_context().foot_definitions.values()):
            token.expand_spans()
    return tokens
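A minimal usage sketch of tokenize_main, assuming the default global parse context and the module path of the block tokenizer (the example text and expected token names are illustrative):

from mistletoe.base_elements import SourceLines
from mistletoe.block_tokenizer import tokenize_main  # module path assumed

lines = SourceLines("# A title\n\nSome *text*\n", standardize_ends=True)
tokens = tokenize_main(lines)  # list of block-level token instances, spans expanded
print([token.name for token in tokens])  # e.g. ['Heading', 'Paragraph']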
def test_fenced_code(name, source, data_regression):
    from mistletoe.base_elements import SourceLines

    print(SourceLines(source).lines)
    data_regression.check(
        serialize_tokens(tokenize_main(source), as_dict=True),
        basename=f"test_fenced_code_{name}",
    )
def read(cls, lines: SourceLines):
    line_buffer = []
    start_line = lines.lineno + 1
    next_line = lines.peek()
    while next_line is not None and next_line.strip() != "":
        line_buffer.append(next(lines))
        next_line = lines.peek()
    string = "".join(line_buffer)
    offset = 0
    matches = []
    while offset < len(string) - 1:
        match_info = cls.match_reference(lines, string, offset)
        if match_info is None:
            break
        offset, match = match_info
        matches.append(match)
    position = Position.from_source_lines(lines, start_line=start_line)
    cls.append_link_definitions(matches, position)
    return cls(position=position, definitions=matches) if matches else None
def read(cls, lines):
    # first line
    start_line = lines.lineno + 1
    line = cls.convert_leading_tabs(next(lines).lstrip()).split(">", 1)[1]
    if len(line) > 0 and line[0] == " ":
        line = line[1:]
    line_buffer = [line]

    # set booleans
    in_code_fence = CodeFence.start(line)
    in_block_code = BlockCode.start(line)
    blank_line = line.strip() == ""

    # loop
    next_line = lines.peek()
    while not cls.transition(next_line):
        stripped = cls.convert_leading_tabs(next_line.lstrip())
        prepend = 0
        if stripped[0] == ">":
            # has leader, not lazy continuation
            prepend += 1
            if stripped[1] == " ":
                prepend += 1
            stripped = stripped[prepend:]
            in_code_fence = CodeFence.start(stripped)
            in_block_code = BlockCode.start(stripped)
            blank_line = stripped.strip() == ""
            line_buffer.append(stripped)
        elif in_code_fence or in_block_code or blank_line:
            # not paragraph continuation text
            break
        else:
            # lazy continuation, preserve whitespace
            line_buffer.append(next_line)
        next(lines)
        next_line = lines.peek()

    # block level tokens are parsed here, so that link_definitions
    # in quotes can be recognized before span-level tokenizing.
    Paragraph.parse_setext = False
    try:
        child_tokens = tokenizer.tokenize_block(
            SourceLines(line_buffer, start_line=start_line)
        )
    finally:
        Paragraph.parse_setext = True

    return cls(
        children=child_tokens,
        position=Position.from_source_lines(lines, start_line=start_line),
    )
def tokenize_block(
    lines: SourceLines, token_types=None, skip_tokens=("LinkDefinition", "Footnote")
):
    """Returns a list of parsed tokens."""
    assert isinstance(lines, SourceLines), "lines must be `SourceLines` instance"
    if token_types is None:
        token_types = get_parse_context().block_tokens
    parsed_tokens = ParseBuffer()
    line = lines.peek()
    while line is not None:
        for token_type in token_types:
            if token_type.start(line):
                token = token_type.read(lines)
                if token is not None:
                    if token.name not in skip_tokens:
                        parsed_tokens.append(token)
                    break
        else:  # unmatched newlines
            next(lines)
            parsed_tokens.loose = True
        line = lines.peek()
    return parsed_tokens
def nested_render_text(self, text: str, lineno: int, token):
    """Render unparsed text."""
    lines = SourceLines(
        text,
        start_line=lineno,
        uri=self.document["source"],
        metadata=token.position.data,
        standardize_ends=True,
    )
    doc_token = myst_block_tokens.Document.read(
        lines, front_matter=True, reset_definitions=False
    )
    # TODO think if this is the best way: here we consume front matter,
    # but then remove it (needed, for example, when included files have front matter)
    doc_token.front_matter = None
    # we mark the token as nested so that footnotes etc aren't rendered
    doc_token.is_nested = True
    self.render(doc_token)
def read(
    cls,
    lines: Union[str, ListType[str], SourceLines],
    reset_definitions: bool = True,
    skip_tokens: list = ("LinkDefinition", "Footnote"),
    front_matter: bool = False,
):
    """Read a document.

    :param lines: Lines to parse
    :param reset_definitions: remove any previously stored definitions
        in the global context (see ``ParseContext.reset_definitions()``).
    :param skip_tokens: do not store these ``token.name`` in the syntax tree.
        These are usually tokens that store themselves in the global context.
    :param front_matter: search for an initial YAML front matter block
        (note this is not strictly CommonMark compliant)
    """
    if reset_definitions:
        get_parse_context().reset_definitions()

    if not isinstance(lines, SourceLines):
        lines = SourceLines(lines, standardize_ends=True)

    # TODO can we do this in a way where we are checking
    # FrontMatter in get_parse_context().block_tokens?
    # then it would be easier to add/remove it in the renderers
    front_matter_token = None
    if front_matter and lines.peek() and lines.peek().startswith("---"):
        front_matter_token = FrontMatter.read(lines)

    children = tokenizer.tokenize_main(lines=lines, skip_tokens=skip_tokens)
    foot_defs = get_parse_context().foot_definitions

    return cls(
        children=children,
        front_matter=front_matter_token,
        link_definitions=get_parse_context().link_definitions,
        footnotes=foot_defs,
        footref_order=[
            t for t in get_parse_context().foot_references if t in foot_defs
        ],
    )
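A hedged sketch of the read flow above, using only calls that appear elsewhere in this section (the example text is illustrative):

from mistletoe.base_elements import SourceLines
from mistletoe.block_tokens import Document

text = "---\ntitle: Example\n---\n\nSome **markdown** content\n"
doc = Document.read(SourceLines(text, standardize_ends=True), front_matter=True)
print(doc.front_matter is not None)  # True: the leading YAML block was consumed
print(len(doc.children))             # block-level tokens for the remaining text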
def parse(self, inputstring: str, document: nodes.document):
    """Parse source text.

    :param inputstring: The source string to parse
    :param document: The root docutils node to add AST elements to
    """
    # TODO add conf.py configurable settings
    self.config = self.default_config.copy()
    try:
        new_cfg = document.settings.env.config.myst_config
        self.config.update(new_cfg)
    except AttributeError:
        pass

    renderer = SphinxRenderer(document=document)
    with renderer:
        # Log to sphinx (e.g. to warn of duplicate link/footnote definitions)
        renderer.parse_context.logger = SPHINX_LOGGER
        lines = SourceLines(inputstring, uri=document["source"], standardize_ends=True)
        doc = Document.read(lines)
        renderer.render(doc)
def read(cls, lines, prev_marker=None):
    next_marker = None
    lines.anchor()
    prepend = -1
    leader = None
    start_line = lines.lineno
    line_buffer = []

    # first line
    line = next(lines)
    prepend, leader = prev_marker if prev_marker else cls.parse_marker(line)
    line = line.replace(leader + "\t", leader + "   ", 1).replace("\t", "    ")
    empty_first_line = line[prepend:].strip() == ""
    if not empty_first_line:
        line_buffer.append(line[prepend:])
    next_line = lines.peek()
    if empty_first_line and next_line is not None and next_line.strip() == "":
        child_tokens = tokenizer.tokenize_block(
            SourceLines([next(lines)], start_line=lines.lineno)
        )
        next_line = lines.peek()
        if next_line is not None:
            marker_info = cls.parse_marker(next_line)
            if marker_info is not None:
                next_marker = marker_info
        return cls(
            children=child_tokens,
            loose=child_tokens.loose,
            prepend=prepend,
            leader=leader,
            next_marker=next_marker,
            position=Position.from_source_lines(lines, start_line=start_line),
        )

    # loop
    newline = 0
    while True:
        # no more lines
        if next_line is None:
            # strip off newlines
            if newline:
                lines.backstep()
                del line_buffer[-newline:]
            break
        next_line = next_line.replace("\t", "    ")
        # not in continuation
        if not cls.in_continuation(next_line, prepend):
            # directly followed by another token
            if cls.transition(next_line):
                if newline:
                    lines.backstep()
                    del line_buffer[-newline:]
                break
            # next_line is a new list item
            marker_info = cls.parse_marker(next_line)
            if marker_info is not None:
                next_marker = marker_info
                break
            # not another item, has newlines -> not continuation
            if newline:
                lines.backstep()
                del line_buffer[-newline:]
                break
        next(lines)
        line = next_line
        stripped = line.lstrip(" ")
        diff = len(line) - len(stripped)
        if diff > prepend:
            stripped = " " * (diff - prepend) + stripped
        line_buffer.append(stripped)
        newline = newline + 1 if next_line.strip() == "" else 0
        next_line = lines.peek()

    child_tokens = tokenizer.tokenize_block(
        SourceLines(line_buffer, start_line=start_line)
    )

    return cls(
        children=child_tokens,
        loose=child_tokens.loose,
        prepend=prepend,
        leader=leader,
        next_marker=next_marker,
        position=Position.from_source_lines(lines, start_line=start_line),
    )
def test_render_heading(self):
    renderer = TOCRenderer()
    Heading.start("### some *text*\n")
    token = Heading.read(SourceLines(["foo"]), expand_spans=True)
    renderer.render_heading(token)
    self.assertEqual(renderer._headings[0], (3, "some text"))
def myst_to_notebook(
    text,
    code_directive=CODE_DIRECTIVE,
    raw_directive=RAW_DIRECTIVE,
    ignore_bad_meta=False,
    store_line_numbers=False,
):
    """Convert text written in the myst format to a notebook.

    :param text: the file text
    :param code_directive: the name of the directive to search for containing code cells
    :param raw_directive: the name of the directive to search for containing raw cells
    :param ignore_bad_meta: ignore metadata that cannot be parsed as JSON/YAML
    :param store_line_numbers: add a `_source_lines` key to cell metadata,
        mapping to the source text.

    NOTE: we assume here that all of these directives are at the top-level,
    i.e. not nested in other directives.
    """
    from mistletoe.base_elements import SourceLines
    from mistletoe.parse_context import (
        ParseContext,
        get_parse_context,
        set_parse_context,
    )
    from mistletoe.block_tokens import Document, CodeFence

    from myst_parser.block_tokens import BlockBreak
    from myst_parser.parse_directives import DirectiveParsingError, parse_directive_text
    from myst_parser.docutils_renderer import DocutilsRenderer

    code_directive = "{{{0}}}".format(code_directive)
    raw_directive = "{{{0}}}".format(raw_directive)

    original_context = get_parse_context()
    parse_context = ParseContext(
        find_blocks=DocutilsRenderer.default_block_tokens,
        find_spans=DocutilsRenderer.default_span_tokens,
    )

    if isinstance(text, SourceLines):
        lines = text
    else:
        lines = SourceLines(text, standardize_ends=True)

    try:
        set_parse_context(parse_context)
        doc = Document.read(lines, front_matter=True)

        metadata_nb = {}
        try:
            metadata_nb = doc.front_matter.get_data() if doc.front_matter else {}
        except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
            if not ignore_bad_meta:
                raise MystMetadataParsingError("Notebook metadata: {}".format(error))

        nbf_version = nbf.v4
        kwargs = {"metadata": nbf.from_dict(metadata_nb)}
        notebook = nbf_version.new_notebook(**kwargs)

        current_line = 0 if not doc.front_matter else doc.front_matter.position.line_end
        md_metadata = {}

        for item in doc.walk(["CodeFence", "BlockBreak"]):

            if isinstance(item.node, BlockBreak):
                token = item.node  # type: BlockBreak
                source = _fmt_md(
                    "".join(lines.lines[current_line:token.position.line_start - 1])
                )
                if source:
                    md_metadata = nbf.from_dict(md_metadata)
                    if store_line_numbers:
                        md_metadata["_source_lines"] = [
                            current_line,
                            token.position.line_start - 1,
                        ]
                    notebook.cells.append(
                        nbf_version.new_markdown_cell(
                            source=source,
                            metadata=md_metadata,
                        )
                    )
                if token.content:
                    md_metadata = {}
                    try:
                        md_metadata = json.loads(token.content.strip())
                    except Exception as err:
                        if not ignore_bad_meta:
                            raise MystMetadataParsingError(
                                "markdown cell {0} at {1} could not be read: {2}".format(
                                    len(notebook.cells) + 1, token.position, err
                                )
                            )
                    if not isinstance(md_metadata, dict):
                        if not ignore_bad_meta:
                            raise MystMetadataParsingError(
                                "markdown cell {0} at {1} is not a dict".format(
                                    len(notebook.cells) + 1, token.position
                                )
                            )
                else:
                    md_metadata = {}
                current_line = token.position.line_start

            if isinstance(item.node, CodeFence) and item.node.language in [
                code_directive,
                raw_directive,
            ]:
                token = item.node  # type: CodeFence
                # Note: we ignore anything after the directive on the first line
                # this is reserved for the optional lexer name
                # TODO: could log a warning if token.arguments != lexer name

                options, body_lines = {}, []
                try:
                    _, options, body_lines = parse_directive_text(
                        directive_class=MockDirective,
                        argument_str="",
                        content=token.children[0].content,
                        validate_options=False,
                    )
                except DirectiveParsingError as err:
                    if not ignore_bad_meta:
                        raise MystMetadataParsingError(
                            "Code cell {0} at {1} could not be read: {2}".format(
                                len(notebook.cells) + 1, token.position, err
                            )
                        )

                md_source = _fmt_md(
                    "".join(lines.lines[current_line:token.position.line_start - 1])
                )
                if md_source:
                    md_metadata = nbf.from_dict(md_metadata)
                    if store_line_numbers:
                        md_metadata["_source_lines"] = [
                            current_line,
                            token.position.line_start - 1,
                        ]
                    notebook.cells.append(
                        nbf_version.new_markdown_cell(
                            source=md_source,
                            metadata=md_metadata,
                        )
                    )
                current_line = token.position.line_end
                md_metadata = {}

                cell_metadata = nbf.from_dict(options)
                if store_line_numbers:
                    cell_metadata["_source_lines"] = [
                        token.position.line_start,
                        token.position.line_end,
                    ]
                if item.node.language == code_directive:
                    notebook.cells.append(
                        nbf_version.new_code_cell(
                            source="\n".join(body_lines),
                            metadata=cell_metadata,
                        )
                    )
                if item.node.language == raw_directive:
                    notebook.cells.append(
                        nbf_version.new_raw_cell(
                            source="\n".join(body_lines),
                            metadata=cell_metadata,
                        )
                    )

        # add the final markdown cell (if present)
        if lines.lines[current_line:]:
            md_metadata = nbf.from_dict(md_metadata)
            if store_line_numbers:
                md_metadata["_source_lines"] = [current_line, len(lines.lines)]
            notebook.cells.append(
                nbf_version.new_markdown_cell(
                    source=_fmt_md("".join(lines.lines[current_line:])),
                    metadata=md_metadata,
                )
            )

    finally:
        set_parse_context(original_context)

    return notebook
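A short, hedged round-trip example for the converter above; the directive name is passed explicitly so no assumption is made about the default value of CODE_DIRECTIVE, and the source text is illustrative:

text = '''---
kernelspec:
  name: python3
---

# A notebook

```{code-cell}
print("hello world")
```
'''
nb = myst_to_notebook(text, code_directive="code-cell")
print([cell.cell_type for cell in nb.cells])  # e.g. ['markdown', 'code']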
def parse(self, inputstring, document):
    # de-serialize the notebook
    ntbk = nbf.reads(inputstring, nbf.NO_CONVERT)

    # This is a container for top level markdown tokens
    # which we will add to as we walk the document
    mkdown_tokens = []  # type: list[BlockToken]

    # First we ensure that we are using a 'clean' global context
    # for parsing, which is setup with the MyST parsing tokens
    # the logger will report on duplicate link/footnote definitions, etc
    parse_context = ParseContext(
        find_blocks=SphinxNBRenderer.default_block_tokens,
        find_spans=SphinxNBRenderer.default_span_tokens,
        logger=SPHINX_LOGGER,
    )
    set_parse_context(parse_context)

    for cell_index, nb_cell in enumerate(ntbk.cells):

        # Skip empty cells
        if len(nb_cell["source"].strip()) == 0:
            continue

        # skip cells tagged for removal
        tags = nb_cell.metadata.get("tags", [])
        if "remove_cell" in tags:
            continue

        if nb_cell["cell_type"] == "markdown":

            # we add the document path and cell index
            # to the source lines, so they can be included in the error logging
            # NOTE: currently the logic to report metadata is not written
            # into SphinxRenderer, but this will be introduced in a later update
            lines = SourceLines(
                nb_cell["source"],
                uri=document["source"],
                metadata={"cell_index": cell_index},
                standardize_ends=True,
            )

            # parse the source markdown text;
            # at this point span/inline level tokens are not yet processed, but
            # link/footnote definitions are collected/stored in the global context
            mkdown_tokens.extend(tokenize_block(lines))

            # TODO for md cells, think of a way to implement the previous
            # `if "hide_input" in tags:` logic

        elif nb_cell["cell_type"] == "code":
            # here we do nothing but store the cell as a custom token
            mkdown_tokens.append(
                NbCodeCell(
                    cell=nb_cell,
                    position=Position(
                        line_start=0,
                        uri=document["source"],
                        data={"cell_index": cell_index},
                    ),
                )
            )

    # Now all definitions have been gathered, we walk the tokens and
    # process any inline text
    for token in mkdown_tokens + list(get_parse_context().foot_definitions.values()):
        token.expand_spans()

    # If there are widgets, this will embed the state of all widgets in a script
    if contains_widgets(ntbk):
        mkdown_tokens.insert(0, JupyterWidgetState(state=get_widgets(ntbk)))

    # create the front matter token
    front_matter = FrontMatter(content=ntbk.metadata, position=None)

    # Finally, we create the top-level markdown document
    markdown_doc = Document(
        children=mkdown_tokens,
        front_matter=front_matter,
        link_definitions=parse_context.link_definitions,
        footnotes=parse_context.foot_definitions,
        footref_order=parse_context.foot_references,
    )

    self.reporter = document.reporter
    self.config = self.default_config.copy()
    try:
        new_cfg = document.settings.env.config.myst_config
        self.config.update(new_cfg)
    except AttributeError:
        pass

    # Remove all the mime prefixes from "glue" step.
    # This way, writing properly captures the glued images
    replace_mime = []
    for cell in ntbk.cells:
        if hasattr(cell, "outputs"):
            for out in cell.outputs:
                if "data" in out:
                    # Only do the mimebundle replacing for the scrapbook outputs
                    mime_prefix = (
                        out.get("metadata", {}).get("scrapbook", {}).get("mime_prefix")
                    )
                    if mime_prefix:
                        out["data"] = {
                            key.replace(mime_prefix, ""): val
                            for key, val in out["data"].items()
                        }
                        replace_mime.append(out)

    # Write the notebook's output to disk.
    # This changes metadata in notebook cells
    path_doc = Path(document.settings.env.docname)
    doc_relpath = path_doc.parent
    doc_filename = path_doc.name
    build_dir = Path(document.settings.env.app.outdir).parent
    output_dir = build_dir.joinpath("jupyter_execute", doc_relpath)
    write_notebook_output(ntbk, str(output_dir), doc_filename)

    # Now add back the mime prefixes to the right outputs so they aren't rendered
    # until called from the role/directive
    for out in replace_mime:
        out["data"] = {
            f"{GLUE_PREFIX}{key}": val for key, val in out["data"].items()
        }

    # Update our glue key list with new ones defined in this page
    glue_domain = NbGlueDomain.from_env(document.settings.env)
    glue_domain.add_notebook(ntbk, path_doc)

    # render the Markdown AST to docutils AST
    renderer = SphinxNBRenderer(
        parse_context=parse_context, document=document, current_node=None
    )
    renderer.render(markdown_doc)