def setext_heading(parser, container=None):
    if not parser.indented and container.t == 'paragraph':
        m = re.search(
            reSetextHeadingLine,
            parser.current_line[parser.next_nonspace:])
        if m:
            parser.close_unmatched_blocks()
            # resolve reference link definitions
            while peek(container.string_content, 0) == '[':
                pos = parser.inline_parser.parseReference(
                    container.string_content, parser.refmap)
                if not pos:
                    break
                container.string_content = container.string_content[pos:]
            if container.string_content:
                heading = Node('heading', container.sourcepos)
                heading.level = 1 if m.group()[0] == '=' else 2
                heading.string_content = container.string_content
                container.insert_after(heading)
                container.unlink()
                parser.tip = heading
                parser.advance_offset(
                    len(parser.current_line) - parser.offset, False)
                return 2
            else:
                return 0
    return 0

def parseAutolink(self, block):
    """Attempt to parse an autolink (URL or email in pointy brackets)."""
    m = self.match(reEmailAutolink)
    if m:
        # email
        dest = m[1:-1]
        node = Node('link', None)
        node.destination = normalize_uri('mailto:' + dest)
        node.title = ''
        node.append_child(text(dest))
        block.append_child(node)
        return True
    else:
        m = self.match(reAutolink)
        if m:
            # link
            dest = m[1:-1]
            node = Node('link', None)
            node.destination = normalize_uri(dest)
            node.title = ''
            node.append_child(text(dest))
            block.append_child(node)
            return True
    return False

def parseHtmlTag(self, block):
    """Attempt to parse a raw HTML tag."""
    m = self.match(common.reHtmlTag)
    if m is None:
        return False
    else:
        node = Node('html_inline', None)
        node.literal = m
        block.append_child(node)
        return True

def new_cell(first_node: Node, last_node: Node) -> Node:
    node = first_node
    cell = Node('tablecell', None)
    while node:
        if node.t != 'text' or '|' not in node.literal:
            cell.append_child(copy.copy(node))
            if node == last_node:
                node = None
            else:
                node = node.nxt
        else:
            _node = node
            node = copy.copy(node)
            new_node = Node('text', None)
            text, remainder = node.literal.split('|', 1)
            node.literal = remainder.lstrip()
            new_node.literal = text.rstrip()
            if not node.literal:
                if _node == last_node:
                    node = None
                else:
                    node = node.nxt
            if new_node.literal:
                cell.append_child(new_node)
            break
    return cell, node

def add_child(self, tag, offset):
    """ Add block of type tag as a child of the tip.  If the tip can't
    accept children, close and finalize it and try its parent,
    and so on til we find a block that can accept children."""
    while not self.blocks[self.tip.t].can_contain(tag):
        self.finalize(self.tip, self.line_number - 1)

    column_number = offset + 1
    new_block = Node(tag, [[self.line_number, column_number], [0, 0]])
    new_block.string_content = ''
    self.tip.append_child(new_block)
    self.tip = new_block
    return new_block

def parseBackslash(self, block):
    """
    Parse a backslash-escaped special character, adding either the
    escaped character, a hard line break (if the backslash is followed
    by a newline), or a literal backslash to the block's children.

    Assumes current character is a backslash.
    """
    subj = self.subject
    self.pos += 1

    try:
        subjchar = subj[self.pos]
    except IndexError:
        subjchar = None

    if self.peek() == '\n':
        self.pos += 1
        node = Node('linebreak', None)
        block.append_child(node)
    elif subjchar and re.search(reEscapable, subjchar):
        block.append_child(text(subjchar))
        self.pos += 1
    else:
        block.append_child(text('\\'))
    return True

def convert_paragraph_to_table(original_node, entering):
    if not entering or original_node.t != 'paragraph':
        return
    copy_node = copy.deepcopy(original_node)
    new_node = Node('table', None)
    new_node.parent = original_node
    for row in yield_rows(copy_node.first_child):
        new_node.append_child(row)
    print_node_walker(new_node)
    if new_node.first_child is None:
        return
    original_node.first_child = new_node
    original_node.last_child = new_node

def add_child(self, tag, offset):
    """ Add block of type tag as a child of the tip.  If the tip can't
    accept children, close and finalize it and try its parent,
    and so on til we find a block that can accept children."""
    block_class = getattr(import_module('commonmark.blocks'),
                          to_camel_case(self.tip.t))
    while not block_class.can_contain(tag):
        self.finalize(self.tip, self.line_number - 1)
        block_class = getattr(import_module('commonmark.blocks'),
                              to_camel_case(self.tip.t))

    column_number = offset + 1
    new_block = Node(tag, [[self.line_number, column_number], [0, 0]])
    new_block.string_content = ''
    self.tip.append_child(new_block)
    self.tip = new_block
    return new_block

def setext_heading(parser, container=None):
    if not parser.indented and container.t == 'paragraph':
        m = re.search(
            reSetextHeadingLine,
            parser.current_line[parser.next_nonspace:])
        if m:
            parser.close_unmatched_blocks()
            heading = Node('heading', container.sourcepos)
            heading.level = 1 if m.group()[0] == '=' else 2
            heading.string_content = container.string_content
            container.insert_after(heading)
            container.unlink()
            parser.tip = heading
            parser.advance_offset(
                len(parser.current_line) - parser.offset, False)
            return 2
    return 0

def new_row(first_node: Node, last_node: Node) -> Node:
    if first_node.t != 'text':
        raise NotTableError(1)
    if last_node.t != 'text':
        raise NotTableError(2)
    if not first_node.literal.startswith('|'):
        raise NotTableError(3)
    if not last_node.literal.rstrip().endswith('|'):
        raise NotTableError(4)

    first_node.literal = first_node.literal[1:].lstrip()
    last_node.literal = last_node.literal.rstrip()[:-1].rstrip()

    if first_node.literal == '':
        if first_node.nxt is None:
            raise NotTableError(5)
        first_node = first_node.nxt
    if last_node.literal == '':
        if last_node.prv is None:
            raise NotTableError(6)
        last_node = last_node.prv

    row = Node('tablerow', None)
    for cell in yield_cells(first_node, last_node):
        row.append_child(cell)
    return row

def parseBackticks(self, block):
    """ Attempt to parse backticks, adding either a backtick code span or a
    literal sequence of backticks to the 'inlines' list."""
    ticks = self.match(reTicksHere)
    if ticks is None:
        return False
    after_open_ticks = self.pos
    matched = self.match(reTicks)
    while matched is not None:
        if matched == ticks:
            node = Node('code', None)
            c = self.subject[after_open_ticks:self.pos - len(ticks)]
            c = c.strip()
            c = re.sub(reWhitespace, ' ', c)
            node.literal = c
            block.append_child(node)
            return True
        matched = self.match(reTicks)
    # If we got here, we didn't match a closing backtick sequence.
    self.pos = after_open_ticks
    block.append_child(text(ticks))
    return True

def parseNewline(self, block):
    """ Parse a newline.  If it was preceded by two spaces, return a hard
    line break; otherwise a soft line break."""
    # assume we're at a \n
    self.pos += 1
    lastc = block.last_child
    if lastc and lastc.t == 'text' and lastc.literal[-1] == ' ':
        linebreak = len(lastc.literal) >= 2 and lastc.literal[-2] == ' '
        lastc.literal = re.sub(reFinalSpace, '', lastc.literal)
        if linebreak:
            node = Node('linebreak', None)
        else:
            node = Node('softbreak', None)
        block.append_child(node)
    else:
        block.append_child(Node('softbreak', None))
    # gobble leading spaces in next line
    self.match(reInitialSpace)
    return True

def parseBackticks(self, block):
    """ Attempt to parse backticks, adding either a backtick code span or a
    literal sequence of backticks to the 'inlines' list."""
    ticks = self.match(reTicksHere)
    if ticks is None:
        return False
    after_open_ticks = self.pos
    matched = self.match(reTicks)
    while matched is not None:
        if matched == ticks:
            node = Node('code', None)
            contents = self.subject[after_open_ticks:self.pos - len(ticks)] \
                .replace('\n', ' ')
            if contents.lstrip(' ') and contents[0] == contents[-1] == ' ':
                node.literal = contents[1:-1]
            else:
                node.literal = contents
            block.append_child(node)
            return True
        matched = self.match(reTicks)
    # If we got here, we didn't match a closing backtick sequence.
    self.pos = after_open_ticks
    block.append_child(text(ticks))
    return True

def __init__(self, options={}):
    self.doc = Node('document', [[1, 1], [0, 0]])
    self.block_starts = BlockStarts()
    self.tip = self.doc
    self.oldtip = self.doc
    self.current_line = ''
    self.line_number = 0
    self.offset = 0
    self.column = 0
    self.next_nonspace = 0
    self.next_nonspace_column = 0
    self.indent = 0
    self.indented = False
    self.blank = False
    self.partially_consumed_tab = False
    self.all_closed = True
    self.last_matched_container = self.doc
    self.refmap = {}
    self.last_line_length = 0
    self.inline_parser = InlineParser(options)
    self.options = options

def parse(self, my_input):
    """ The main parsing function.  Returns a parsed document AST."""
    self.doc = Node('document', [[1, 1], [0, 0]])
    self.tip = self.doc
    self.refmap = {}
    self.line_number = 0
    self.last_line_length = 0
    self.offset = 0
    self.column = 0
    self.last_matched_container = self.doc
    self.current_line = ''
    lines = re.split(reLineEnding, my_input)
    length = len(lines)
    if len(my_input) > 0 and my_input[-1] == '\n':
        # ignore last blank line created by final newline
        length -= 1
    for i in range(length):
        self.incorporate_line(lines[i])
    while self.tip:
        self.finalize(self.tip, length)
    self.process_inlines(self.doc)
    return self.doc

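A minimal usage sketch of the parser above. It assumes the public Parser class exported by the commonmark package and the (node, entering) tuples yielded by Node.walker(), as exercised in the tests further down; the input string is only an example.

# Minimal usage sketch (public commonmark Parser; sample input is made up).
from commonmark import Parser

parser = Parser()
ast = parser.parse('# Title\n\nSome *emphasised* text.\n')

# walker() yields (node, entering) pairs; containers are visited twice.
for node, entering in ast.walker():
    if entering:
        print(node.t, repr(node.literal))
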
def processEmphasis(self, stack_bottom):
    openers_bottom = {
        '_': stack_bottom,
        '*': stack_bottom,
        "'": stack_bottom,
        '"': stack_bottom,
    }
    odd_match = False
    use_delims = 0

    # Find first closer above stack_bottom
    closer = self.delimiters
    while closer is not None and closer.get('previous') != stack_bottom:
        closer = closer.get('previous')

    # Move forward, looking for closers, and handling each
    while closer is not None:
        if not closer.get('can_close'):
            closer = closer.get('next')
        else:
            # found emphasis closer. now look back for first
            # matching opener:
            opener = closer.get('previous')
            opener_found = False
            closercc = closer.get('cc')
            while (opener is not None and opener != stack_bottom and
                   opener != openers_bottom[closercc]):
                odd_match = (closer.get('can_open') or
                             opener.get('can_close')) and \
                    closer['origdelims'] % 3 != 0 and \
                    (opener['origdelims'] + closer['origdelims']) % 3 == 0
                if opener.get('cc') == closercc and \
                        opener.get('can_open') and \
                        not odd_match:
                    opener_found = True
                    break
                opener = opener.get('previous')
            old_closer = closer

            if closercc == '*' or closercc == '_':
                if not opener_found:
                    closer = closer.get('next')
                else:
                    # Calculate actual number of delimiters used from closer
                    use_delims = 2 if (closer['numdelims'] >= 2 and
                                       opener['numdelims'] >= 2) else 1
                    opener_inl = opener.get('node')
                    closer_inl = closer.get('node')

                    # Remove used delimiters from stack elts and inlines
                    opener['numdelims'] -= use_delims
                    closer['numdelims'] -= use_delims
                    opener_inl.literal = opener_inl.literal[
                        :len(opener_inl.literal) - use_delims]
                    closer_inl.literal = closer_inl.literal[
                        :len(closer_inl.literal) - use_delims]

                    # Build contents for new Emph element
                    if use_delims == 1:
                        emph = Node('emph', None)
                    else:
                        emph = Node('strong', None)

                    tmp = opener_inl.nxt
                    while tmp and tmp != closer_inl:
                        nxt = tmp.nxt
                        tmp.unlink()
                        emph.append_child(tmp)
                        tmp = nxt

                    opener_inl.insert_after(emph)

                    # Remove elts between opener and closer in delimiters
                    # stack
                    self.removeDelimitersBetween(opener, closer)

                    # If opener has 0 delims, remove it and the inline
                    if opener['numdelims'] == 0:
                        opener_inl.unlink()
                        self.removeDelimiter(opener)

                    if closer['numdelims'] == 0:
                        closer_inl.unlink()
                        tempstack = closer['next']
                        self.removeDelimiter(closer)
                        closer = tempstack
            elif closercc == "'":
                closer['node'].literal = '\u2019'
                if opener_found:
                    opener['node'].literal = '\u2018'
                closer = closer['next']
            elif closercc == '"':
                closer['node'].literal = '\u201D'
                if opener_found:
                    opener['node'].literal = '\u201C'
                closer = closer['next']

            if not opener_found and not odd_match:
                # Set lower bound for future searches for openers:
                # We don't do this with odd_match because a **
                # that doesn't match an earlier * might turn into
                # an opener, and the * might be matched by something
                # else.
                openers_bottom[closercc] = old_closer['previous']
                if not old_closer['can_open']:
                    # We can remove a closer that can't be an opener,
                    # once we've seen there's no matching opener:
                    self.removeDelimiter(old_closer)

    # Remove all delimiters
    while self.delimiters is not None and self.delimiters != stack_bottom:
        self.removeDelimiter(self.delimiters)

def test_node_walker_iter(self):
    node = Node('document', [[1, 1], [0, 0]])
    for subnode, entered in node.walker():
        pass

def text(s):
    node = Node('text', None)
    node.literal = s
    return node

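For illustration, a small sketch of how this helper is used when assembling inline children. It assumes the text() function above is in scope and that Node is importable from commonmark.node (the package layout suggested by the snippets here).

# Sketch only: build a paragraph with a single text child using the helper
# above (the Node import path is an assumption).
from commonmark.node import Node

para = Node('paragraph', None)
para.append_child(text('hello world'))
print(para.first_child.literal)  # 'hello world'
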
def parseCloseBracket(self, block):
    """ Try to match close bracket against an opening in the delimiter
    stack.  Add either a link or image, or a plain [ character,
    to block's children.  If there is a matching delimiter,
    remove it from the delimiter stack."""
    title = None
    matched = False
    self.pos += 1
    startpos = self.pos

    # get last [ or ![
    opener = self.brackets

    if opener is None:
        # no matched opener, just return a literal
        block.append_child(text(']'))
        return True

    if not opener.get('active'):
        # no matched opener, just return a literal
        block.append_child(text(']'))
        # take opener off brackets stack
        self.removeBracket()
        return True

    # If we got here, opener is a potential opener
    is_image = opener.get('image')

    # Check to see if we have a link/image
    savepos = self.pos

    # Inline link?
    if self.peek() == '(':
        self.pos += 1
        self.spnl()
        dest = self.parseLinkDestination()
        if dest is not None and self.spnl():
            # make sure there's a space before the title
            if re.search(reWhitespaceChar, self.subject[self.pos - 1]):
                title = self.parseLinkTitle()
            if self.spnl() and self.peek() == ')':
                self.pos += 1
                matched = True
        else:
            self.pos = savepos

    if not matched:
        # Next, see if there's a link label
        beforelabel = self.pos
        n = self.parseLinkLabel()
        if n > 2:
            reflabel = self.subject[beforelabel:beforelabel + n]
        elif not opener.get('bracket_after'):
            # Empty or missing second label means to use the first
            # label as the reference.  The reference must not
            # contain a bracket. If we know there's a bracket, we
            # don't even bother checking it.
            reflabel = self.subject[opener.get('index'):startpos]
        if n == 0:
            # If shortcut reference link, rewind before spaces we skipped.
            self.pos = savepos

        if reflabel:
            # lookup rawlabel in refmap
            link = self.refmap.get(normalize_reference(reflabel))
            if link:
                dest = link['destination']
                title = link['title']
                matched = True

    if matched:
        node = Node('image' if is_image else 'link', None)
        node.destination = dest
        node.title = title or ''
        tmp = opener.get('node').nxt
        while tmp:
            nxt = tmp.nxt
            tmp.unlink()
            node.append_child(tmp)
            tmp = nxt
        block.append_child(node)
        self.processEmphasis(opener.get('previousDelimiter'))
        self.removeBracket()
        opener.get('node').unlink()

        # We remove this bracket and processEmphasis will remove
        # later delimiters.
        # Now, for a link, we also deactivate earlier link openers.
        # (no links in links)
        if not is_image:
            opener = self.brackets
            while opener is not None:
                if not opener.get('image'):
                    # deactivate this opener
                    opener['active'] = False
                opener = opener.get('previous')

        return True
    else:
        # no match
        # remove this opener from stack
        self.removeBracket()
        self.pos = startpos
        block.append_child(text(']'))
        return True

def test_doc_node(self):
    Node('document', [[1, 1], [0, 0]])

def test_node_walker(self):
    node = Node('document', [[1, 1], [0, 0]])
    NodeWalker(node)

def nestSections(block, level=1):
    """ Sections aren't handled by CommonMark at the moment.  This function
    adds sections to a block of nodes.  Each 'heading' node at the given
    'level' starts a new child section, and the nodes that follow it are
    placed inside that section.  If there are no child nodes with headings
    of level 'level' then nothing is done."""
    cur = block.first_child
    if cur is None:
        return

    children = []

    # Do we need to do anything?
    nest = False
    while cur is not None:
        if cur.t == 'heading' and cur.level == level:
            nest = True
            break
        cur = cur.nxt
    if not nest:
        return

    section = Node('MDsection', 0)
    section.parent = block
    cur = block.first_child
    while cur is not None:
        if cur.t == 'heading' and cur.level == level:
            # Found a split point, flush the last section if needed
            if section.first_child is not None:
                finalizeSection(section)
                children.append(section)
                section = Node('MDsection', 0)
        nxt = cur.nxt
        # Avoid adding sections without titles at the start
        if section.first_child is None:
            if cur.t == 'heading' and cur.level == level:
                section.append_child(cur)
            else:
                children.append(cur)
        else:
            section.append_child(cur)
        cur = nxt

    # Flush the trailing section if it has content
    if section.first_child is not None:
        finalizeSection(section)
        children.append(section)

    block.first_child = None
    block.last_child = None

    nextLevel = level + 1
    for child in children:
        # Handle nesting
        if child.t == 'MDsection':
            nestSections(child, level=nextLevel)
        # Append
        if block.first_child is None:
            block.first_child = child
        else:
            block.last_child.nxt = child
        child.parent = block
        child.nxt = None
        # Node uses 'prv' (not 'prev') for the previous-sibling link
        child.prv = block.last_child
        block.last_child = child
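
A hypothetical usage sketch for nestSections. It assumes the function above is in scope together with the finalizeSection helper it references (not shown here), and it uses the commonmark Parser to build the AST; the input text is made up.

# Hypothetical sketch: nestSections and finalizeSection (referenced above but
# not shown) are assumed to be in scope.
from commonmark import Parser

doc = Parser().parse('# One\n\ntext\n\n## Nested\n\nmore\n\n# Two\n')
nestSections(doc)

child = doc.first_child
while child is not None:
    print(child.t)  # expect 'MDsection' wrappers at the top level
    child = child.nxt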