def parseWrappedText(self, block, reWrap, reWrapHere, tagname,
                     break_on_whitespace=False):
    """Parse a span wrapped in matching delimiter runs, e.g. ``H~2~O``.

    ``reWrapHere`` must match the opening run at the current position;
    ``reWrap`` is then matched repeatedly to look for a closing run equal to
    the opening one. On success a ``tagname`` node carrying the enclosed text
    is appended to ``block``.

    When ``break_on_whitespace`` is false the enclosed text is stripped and
    its whitespace is collapsed with ``reWhitespace``. When it is true the
    text is kept verbatim, but a space inside it abandons the span.

    Returns False if there is no opening run here, True otherwise (either a
    ``tagname`` node or the literal opening run was appended).
    """
    opening = self.match(reWrapHere)
    if opening is None:
        return False

    content_start = self.pos
    while True:
        candidate = self.match(reWrap)
        if candidate is None:
            break
        if candidate != opening:
            # A run of a different length cannot close this span.
            continue

        node = Node(tagname, None)
        content = self.subject[content_start:self.pos - len(opening)]
        if break_on_whitespace:
            if ' ' in content:
                break
        else:
            content = re.sub(reWhitespace, ' ', content.strip())
        node.literal = content
        block.append_child(node)
        return True

    # If we got here, we didn't match a closing sequence (or gave up on it):
    # rewind to just after the opening run and emit that run as plain text.
    self.pos = content_start
    block.append_child(text(opening))
    return True
def parseAutolink(self, block):
    """Attempt to parse an autolink (URL or email in pointy brackets).

    An email autolink is tried first and gets a ``mailto:`` destination;
    otherwise a URL autolink is tried. On success a 'Link' node with an empty
    title and the bare address as its text child is appended to ``block``.

    Returns True if an autolink was consumed, False otherwise.
    """
    scheme = 'mailto:'
    target = self.match(reEmailAutolink)
    if not target:
        # Not an email address: fall back to a plain URL autolink.
        scheme = ''
        target = self.match(reAutolink)
        if not target:
            return False

    # Drop the surrounding '<' and '>'.
    dest = target[1:-1]
    link = Node('Link', None)
    link.destination = normalize_uri(scheme + dest)
    link.title = ''
    link.append_child(text(dest))
    block.append_child(link)
    return True
def parseLiqidTag(self, block):
    """Attempt to parse a raw Liqid tag.

    Appends a 'liqid_inline' node whose literal is the text matched by
    ``reLiqidBlock``. Returns True on a match, False otherwise.
    """
    raw = self.match(reLiqidBlock)
    if raw is None:
        return False

    tag_node = Node('liqid_inline', None)
    tag_node.literal = raw
    block.append_child(tag_node)
    return True
def parseHtmlTag(self, block):
    """Attempt to parse a raw HTML tag.

    Appends an 'HtmlInline' node whose literal is the text matched by
    ``common.reHtmlTag``. Returns True on a match, False otherwise.
    """
    markup = self.match(common.reHtmlTag)
    matched = markup is not None
    if matched:
        html_node = Node('HtmlInline', None)
        html_node.literal = markup
        block.append_child(html_node)
    return matched
def add_child(self, tag, offset):
    """Add a block of type ``tag`` as a child of the tip.

    If the tip can't accept children, it is closed and finalized and its
    parent is tried, and so on until a block that can accept the child is
    found. The new block becomes the tip and is returned.
    """
    def tip_class():
        # Block class named after the current tip's type.
        return getattr(import_module('CommonMark.blocks'), self.tip.t)

    while not tip_class().can_contain(tag):
        self.finalize(self.tip, self.line_number - 1)

    start = [self.line_number, offset + 1]
    child = Node(tag, [start, [0, 0]])
    child.string_content = ''
    self.tip.append_child(child)
    self.tip = child
    return child
def add_child(self, tag, offset):
    """Add a block of type ``tag`` as a child of the tip.

    Tips that cannot contain ``tag`` are finalized one after another until
    the current tip can. The new block is then appended to that tip, becomes
    the new tip, and is returned.
    """
    while True:
        handler = getattr(import_module('CommonMark.blocks'), self.tip.t)
        if handler.can_contain(tag):
            break
        # This tip cannot hold the new block: close it and retry.
        self.finalize(self.tip, self.line_number - 1)

    # Source positions are 1-based; the end position is filled in later.
    sourcepos = [[self.line_number, offset + 1], [0, 0]]
    added = Node(tag, sourcepos)
    added.string_content = ''
    self.tip.append_child(added)
    self.tip = added
    return added
def __init__(self, subject=None, pos=0):
    """Initialise the parser with a fresh 'Document' root.

    ``subject`` and ``pos`` are stored as given; the tip starts at the
    document root, the reference map starts empty, and a new
    ``InlineParser`` is created.
    """
    root = Node.makeNode("Document", 1, 1)
    self.doc = root
    self.subject = subject
    self.pos = pos
    self.tip = root
    self.refmap = {}
    self.inlineParser = InlineParser()
def parseBackslash(self, block):
    """Parse a backslash escape at the current position.

    Appends one of the following to ``block``: a 'Hardbreak' node when the
    backslash is followed by a newline, the escaped character when it matches
    ``reEscapable``, or a literal backslash otherwise. Assumes the current
    character is a backslash. Always returns True.
    """
    self.pos += 1
    try:
        escaped = self.subject[self.pos]
    except IndexError:
        # Backslash was the last character of the subject.
        escaped = None

    if self.peek() == '\n':
        self.pos += 1
        block.append_child(Node('Hardbreak', None))
        return True

    if escaped and re.match(reEscapable, escaped):
        block.append_child(text(escaped))
        self.pos += 1
        return True

    block.append_child(text('\\'))
    return True
def setext_heading(parser, container=None):
    """Block start: turn a paragraph into a setext heading.

    When the current line is not indented, ``container`` is a 'Paragraph',
    and the rest of the line matches ``reSetextHeadingLine``, the paragraph
    is replaced by a 'Heading' node carrying its ``string_content``. The
    level is 1 for an ``=`` underline and 2 otherwise. The heading becomes
    the parser's tip and the rest of the line is consumed.

    Args:
        parser: block parser whose current line is being examined.
        container: the block that may be converted. ``None`` (the default)
            means no heading can start here.

    Returns:
        2 when a heading was created, 0 otherwise.
    """
    # The signature allows container=None; treat it as "no match" instead of
    # failing on ``container.t``.
    if parser.indented or container is None or container.t != 'Paragraph':
        return 0

    m = re.match(reSetextHeadingLine,
                 parser.current_line[parser.next_nonspace:])
    if not m:
        return 0

    parser.close_unmatched_blocks()
    heading = Node('Heading', container.sourcepos)
    heading.level = 1 if m.group()[0] == '=' else 2
    heading.string_content = container.string_content
    # Swap the paragraph for the heading in the tree.
    container.insert_after(heading)
    container.unlink()
    parser.tip = heading
    parser.advance_offset(
        len(parser.current_line) - parser.offset, False)
    return 2
def setext_heading(parser, container=None):
    """Block start: turn a paragraph into a setext heading.

    When the current line is not indented, ``container`` is a 'paragraph',
    and the rest of the line matches ``reSetextHeadingLine``, the paragraph
    is replaced by a 'heading' node carrying its ``string_content``. The
    level is 1 for an ``=`` underline and 2 otherwise. The heading becomes
    the parser's tip and the rest of the line is consumed.

    Args:
        parser: block parser whose current line is being examined.
        container: the block that may be converted. ``None`` (the default)
            means no heading can start here.

    Returns:
        2 when a heading was created, 0 otherwise.
    """
    # The signature allows container=None; treat it as "no match" instead of
    # failing on ``container.t``.
    if parser.indented or container is None or container.t != 'paragraph':
        return 0

    m = re.match(
        reSetextHeadingLine,
        parser.current_line[parser.next_nonspace:])
    if not m:
        return 0

    parser.close_unmatched_blocks()
    heading = Node('heading', container.sourcepos)
    heading.level = 1 if m.group()[0] == '=' else 2
    heading.string_content = container.string_content
    # Swap the paragraph for the heading in the tree.
    container.insert_after(heading)
    container.unlink()
    parser.tip = heading
    parser.advance_offset(
        len(parser.current_line) - parser.offset, False)
    return 2
def parse(self, my_input):
    """The main parsing function. Returns a parsed document AST.

    Resets the document, tip and reference map, feeds every line of
    ``my_input`` to ``incorporateLine`` with its 1-based line number,
    finalizes all blocks that are still open, and then runs inline
    processing on the document.
    """
    self.doc = Node.makeNode("Document", 1, 1)
    self.tip = self.doc
    self.refmap = {}

    # Drop one trailing newline before splitting into lines.
    trimmed = re.sub(r'\n$', '', my_input)
    lines = re.split(reLineEnding, trimmed)
    for line_number, line in enumerate(lines, 1):
        self.incorporateLine(line, line_number)

    line_count = len(lines)
    while self.tip:
        self.finalize(self.tip, line_count)

    self.processInlines(self.doc)
    return self.doc
def parseBackticks(self, block):
    """Attempt to parse a backtick code span.

    Appends either a 'Code' node or, when no closing run of the same length
    is found, the opening backtick run as literal text. The code span's
    content is stripped and its whitespace is collapsed with
    ``reWhitespace``.

    Returns False when there are no backticks at the current position,
    True otherwise.
    """
    opening = self.match(reTicksHere)
    if opening is None:
        return False

    content_start = self.pos
    while True:
        run = self.match(reTicks)
        if run is None:
            break
        if run != opening:
            # Only a run of the same length closes the span.
            continue

        code = Node('Code', None)
        raw = self.subject[content_start:self.pos - len(opening)]
        code.literal = re.sub(reWhitespace, ' ', raw.strip())
        block.append_child(code)
        return True

    # If we got here, we didn't match a closing backtick sequence.
    self.pos = content_start
    block.append_child(text(opening))
    return True
def parseNewline(self, block):
    """Parse a newline into a hard or soft line break.

    If the preceding 'Text' child ends with two spaces a 'Hardbreak' is
    appended, otherwise a 'Softbreak'. Trailing spaces matched by
    ``reFinalSpace`` are removed from that text node, and leading spaces of
    the next line are consumed. Always returns True.
    """
    # assume we're at a \n
    self.pos += 1

    break_type = 'Softbreak'
    previous = block.last_child
    if previous and previous.t == 'Text' and previous.literal[-1] == ' ':
        if previous.literal.endswith('  '):
            break_type = 'Hardbreak'
        previous.literal = re.sub(reFinalSpace, '', previous.literal)
    block.append_child(Node(break_type, None))

    # gobble leading spaces in next line
    self.match(reInitialSpace)
    return True
def section(ast):
    """Split ``ast`` at its 'Heading' children into separate 'Document' nodes.

    The result list starts as ``[ast]``. For every heading yielded by
    ``children(ast)`` a new 'Document' node is created with that heading as
    its first child, and the node is appended to the result. The result is
    reset to an empty list when the heading is ``ast.first_child``.

    NOTE(review): only the heading itself gets its ``parent`` re-pointed;
    confirm whether following siblings are meant to keep their old parent.
    NOTE(review): correctness depends on how ``children`` (defined elsewhere)
    iterates while sibling links are rewritten here -- TODO confirm.
    """
    sections = [ast]
    for n in children(ast):
        if n.t == u'Heading':
            doc = Node('Document', [[1, 1], [0, 0]])
            # The heading becomes the first child of the new document, which
            # takes over the current tail of ``ast``.
            n.parent = doc
            doc.first_child = n
            doc.last_child = ast.last_child
            if n == ast.first_child:
                # Nothing precedes this heading inside ``ast``.
                sections = []
            # Cut ``ast`` off just before the heading.
            ast.last_child = n.prv
            if n.prv:
                n.prv.nxt = None
            else:
                ast.first_child = None
            n.prv = None
            sections.append(doc)
    return sections
def addChild(self, tag, line_number, offset):
    """Add a block of type ``tag`` as a child of the tip.

    Tips that cannot accept children are finalized until one is reached that
    can: a 'Document', 'BlockQuote' or 'Item', or a 'List' when the new block
    is an 'Item'. The new node becomes the tip and is returned.
    """
    def tip_accepts_child():
        kind = self.tip.t
        if kind in ("Document", "BlockQuote", "Item"):
            return True
        # Lists may only contain items.
        return kind == "List" and tag == "Item"

    while not tip_accepts_child():
        self.finalize(self.tip, line_number - 1)

    child = Node.makeNode(tag, line_number, offset + 1)
    self.tip.children.append(child)
    child.parent = self.tip
    self.tip = child
    return child
def toc(ast, autonumber, includetitle):
    """Build a nested table of contents from the headings in ``ast``.

    Each entry is a tuple ``(level, title, children, attrs)`` where
    ``children`` is the list of entries nested under it. As a side effect
    every heading node gets an ``id`` of the form ``h-1-2...`` derived from
    its position ("tumbler"), and, when ``autonumber`` is true and the
    heading has children, a new 'text' node with the dotted number is
    installed as its first child.

    Args:
        ast: tree whose ``walker()`` yields ``(node, entering)`` pairs.
        autonumber: prepend the section number to each heading.
        includetitle: when false, the first tumbler component is dropped.

    Returns:
        The list of top-level entries.

    NOTE(review): ``text`` is defined elsewhere and is called with a node
    here; presumably it returns the heading's text -- confirm.
    """
    top = []
    # Stack of child lists: current[-1] is where the next deeper entry goes.
    current = [top]
    level = 0
    for node, entering in ast.walker():
        if node.t == 'heading' and entering:
            title = text(node)
            attrs = {}
            if node.level == level:
                # Sibling of the previous heading: replace the top of stack.
                current.pop()
                spec = (node.level, title, [], attrs)
                current[-1].append(spec)
                current.append(spec[2])
            elif node.level > level:
                # Added empty intermediary levels
                for newLevel in range(level + 1, node.level + 1):
                    # Skipped levels get placeholder entries with no title
                    # and no attrs.
                    spec = (newLevel,
                            None if newLevel < node.level else title,
                            [],
                            None if newLevel < node.level else attrs)
                    current[-1].append(spec)
                    current.append(spec[2])
                level = node.level
            elif node.level < level:
                # Pop back up to the parent of the new heading's level.
                for oldLevel in range(node.level, level + 1):
                    current.pop()
                spec = (node.level, title, [], attrs)
                current[-1].append(spec)
                current.append(spec[2])
                level = node.level
            # Position of this heading: entry count at each enclosing level.
            tumbler = list(map(lambda x: len(x), current[0:-1]))
            if not includetitle:
                tumbler = tumbler[1:]
            node.id = 'h' + ''.join(map(lambda n: '-' + str(n), tumbler))
            attrs['id'] = node.id
            attrs['tumbler'] = tumbler
            if autonumber and node.first_child:
                # Link a new text node in front of the existing first child.
                first = node.first_child
                node.first_child = Node('text', first.sourcepos)
                first.prv = node.first_child
                node.first_child.parent = node
                node.first_child.nxt = first
                node.first_child.literal = ''.join(
                    map(lambda n: str(n) + '.', tumbler)) + ' '
    return top
def __init__(self, options=None):
    """Initialise the block parser state.

    Args:
        options: dict handed to the ``InlineParser`` and kept on
            ``self.options``. When omitted, each parser gets its own empty
            dict, so mutating one parser's options cannot leak into others
            through a shared default.
    """
    if options is None:
        options = {}
    self.doc = Node('Document', [[1, 1], [0, 0]])
    self.block_starts = BlockStarts()
    self.tip = self.doc
    self.oldtip = self.doc
    self.current_line = ''
    self.line_number = 0
    # Position bookkeeping for the line currently being processed.
    self.offset = 0
    self.column = 0
    self.next_nonspace = 0
    self.next_nonspace_column = 0
    self.indent = 0
    self.indented = False
    self.blank = False
    self.all_closed = True
    self.last_matched_container = self.doc
    self.refmap = {}
    self.last_line_length = 0
    self.inline_parser = InlineParser(options)
    self.options = options
def parse(self, my_input):
    """The main parsing function. Returns a parsed document AST.

    Resets the per-document state, feeds each input line to
    ``incorporate_line``, finalizes every block that is still open, and runs
    inline processing on the document.
    """
    root = Node('document', [[1, 1], [0, 0]])
    self.doc = root
    self.tip = root
    self.refmap = {}
    self.line_number = 0
    self.last_line_length = 0
    self.offset = 0
    self.column = 0
    self.last_matched_container = root
    self.current_line = ''

    lines = re.split(reLineEnding, my_input)
    length = len(lines)
    if my_input.endswith('\n'):
        # ignore last blank line created by final newline
        length -= 1

    for line in lines[:length]:
        self.incorporate_line(line)

    while self.tip:
        self.finalize(self.tip, length)

    self.process_inlines(self.doc)
    return self.doc
def test_doc_node(self):
    """Smoke test: constructing a 'document' node must not raise."""
    sourcepos = [[1, 1], [0, 0]]
    Node('document', sourcepos)
def text(s):
    """Return a new 'Text' node whose literal is ``s``."""
    text_node = Node('Text', None)
    text_node.literal = s
    return text_node
def test_node_walker_iter(self):
    """Smoke test: iterating a 'Document' node's walker must not raise."""
    root = Node('Document', [[1, 1], [0, 0]])
    walker = root.walker()
    for subnode, entered in walker:
        continue
def processEmphasis(self, stack_bottom):
    """Resolve emphasis and smart-quote delimiters above ``stack_bottom``.

    Walks the delimiter stack (linked through 'previous'/'next') from the
    first entry above ``stack_bottom`` towards the top. For each ``*``/``_``
    closer with a matching opener, the inlines between them are moved into a
    new 'Emph' or 'Strong' node. For ``'`` and ``"`` closers the delimiter
    text is replaced with curly quotes. Finally every delimiter above
    ``stack_bottom`` is removed from the stack.

    Args:
        stack_bottom: delimiter entry below which nothing is touched, or
            ``None`` to process the whole stack.
    """
    # Per-character lower bound for opener searches; raised whenever a
    # closer is found to have no opener.
    openers_bottom = {
        '_': stack_bottom,
        '*': stack_bottom,
        "'": stack_bottom,
        '"': stack_bottom,
    }
    use_delims = 0

    # Find first closer above stack_bottom
    closer = self.delimiters
    while closer is not None and closer.get('previous') != stack_bottom:
        closer = closer.get('previous')

    # Move forward, looking for closers, and handling each
    while closer is not None:
        closercc = closer.get('cc')
        if not (closer.get('can_close') and
                (closercc == '_' or
                 closercc == '*' or
                 closercc == "'" or
                 closercc == '"')):
            closer = closer.get('next')
        else:
            # found emphasis closer. now look back for first
            # matching opener:
            opener = closer.get('previous')
            opener_found = False
            while (opener is not None and opener != stack_bottom and
                   opener != openers_bottom[closercc]):
                if opener.get('cc') == closercc and opener.get('can_open'):
                    opener_found = True
                    break
                opener = opener.get('previous')
            # ``closer`` is advanced below; keep the entry being handled.
            old_closer = closer

            if closercc == '*' or closercc == '_':
                if not opener_found:
                    closer = closer.get('next')
                else:
                    # Calculate actual number of delimiters used from
                    # closer
                    if closer['numdelims'] < 3 or opener['numdelims'] < 3:
                        if closer['numdelims'] <= opener['numdelims']:
                            use_delims = closer['numdelims']
                        else:
                            use_delims = opener['numdelims']
                    else:
                        if closer['numdelims'] % 2 == 0:
                            use_delims = 2
                        else:
                            use_delims = 1

                    opener_inl = opener.get('node')
                    closer_inl = closer.get('node')

                    # Remove used delimiters from stack elts and inlines
                    opener['numdelims'] -= use_delims
                    closer['numdelims'] -= use_delims
                    opener_inl.literal = opener_inl.literal[
                        :len(opener_inl.literal) - use_delims]
                    closer_inl.literal = closer_inl.literal[
                        :len(closer_inl.literal) - use_delims]

                    # Build contents for new Emph element
                    if use_delims == 1:
                        emph = Node('Emph', None)
                    else:
                        emph = Node('Strong', None)

                    # Move every inline between the two delimiter nodes
                    # into the new element.
                    tmp = opener_inl.nxt
                    while tmp and tmp != closer_inl:
                        nxt = tmp.nxt
                        tmp.unlink()
                        emph.append_child(tmp)
                        tmp = nxt

                    opener_inl.insert_after(emph)

                    # Remove elts between opener and closer in delimiters
                    # stack
                    self.removeDelimitersBetween(opener, closer)

                    # If opener has 0 delims, remove it and the inline
                    if opener['numdelims'] == 0:
                        opener_inl.unlink()
                        self.removeDelimiter(opener)

                    # A closer with delimiters left is handled again on the
                    # next pass of the loop.
                    if closer['numdelims'] == 0:
                        closer_inl.unlink()
                        tempstack = closer['next']
                        self.removeDelimiter(closer)
                        closer = tempstack

            elif closercc == "'":
                # Right single quote; the opener, if any, becomes a left one.
                closer['node'].literal = '\u2019'
                if opener_found:
                    opener['node'].literal = '\u2018'
                closer = closer['next']

            elif closercc == '"':
                # Right double quote; the opener, if any, becomes a left one.
                closer['node'].literal = '\u201D'
                if opener_found:
                    opener['node'].literal = '\u201C'
                closer = closer['next']

            if not opener_found:
                # Set lower bound for future searches for openers:
                openers_bottom[closercc] = old_closer['previous']
                if not old_closer['can_open']:
                    # We can remove a closer that can't be an opener,
                    # once we've seen there's no matching opener:
                    self.removeDelimiter(old_closer)

    # Remove all delimiters
    while self.delimiters is not None and self.delimiters != stack_bottom:
        self.removeDelimiter(self.delimiters)
def parseCloseBracket(self, block):
    """Try to match a close bracket against an opener in the delimiter stack.

    Appends either a 'Link' or 'Image' node, or a plain ``]`` character, to
    ``block``'s children. If there is a matching opener it is removed from
    the delimiter stack. Always returns True.

    An inline destination ``(...)`` is tried when the next character is
    ``(``; otherwise a reference label is looked up in ``self.refmap``.
    """
    title = None
    matched = False
    # Step over the ']' itself.
    self.pos += 1
    startpos = self.pos

    # look through the stack of delimiters for a [ or ![
    opener = self.delimiters
    while opener is not None:
        if opener.get('cc') == '[' or opener.get('cc') == '!':
            break
        opener = opener.get('previous')

    if opener is None:
        # no matched opener, just return a literal
        block.append_child(text(']'))
        return True

    if not opener.get('active'):
        # no matched opener, just return a literal
        block.append_child(text(']'))
        # take opener off emphasis stack
        self.removeDelimiter(opener)
        return True

    # If we got here, opener is a potential opener
    is_image = opener.get('cc') == '!'

    # Check to see if we have a link/image

    # Inline link?
    if self.peek() == '(':
        self.pos += 1
        self.spnl()
        dest = self.parseLinkDestination()
        if dest is not None and \
           self.spnl():
            # make sure there's a space before the title
            if re.match(reWhitespaceChar, self.subject[self.pos-1]):
                title = self.parseLinkTitle()
            if self.spnl() and self.peek() == ')':
                self.pos += 1
                matched = True
    else:
        # Next, see if there's a link label
        savepos = self.pos
        beforelabel = self.pos
        n = self.parseLinkLabel()
        if n == 0 or n == 2:
            # empty or missing second label
            reflabel = self.subject[opener['index']:startpos]
        else:
            reflabel = self.subject[beforelabel:beforelabel + n]
        if n == 0:
            # If shortcut reference link, rewind before spaces we skipped.
            self.pos = savepos

        # lookup rawlabel in refmap
        link = self.refmap.get(normalizeReference(reflabel))
        if link:
            dest = link['destination']
            title = link['title']
            matched = True

    if matched:
        node = Node('Image' if is_image else 'Link', None)
        node.destination = dest
        node.title = title or ''
        # Everything after the opener's node becomes the link content.
        tmp = opener.get('node').nxt
        while tmp:
            nxt = tmp.nxt
            tmp.unlink()
            node.append_child(tmp)
            tmp = nxt
        block.append_child(node)
        self.processEmphasis(opener.get('previous'))

        opener.get('node').unlink()

        # processEmphasis will remove this and later delimiters.
        # Now, for a link, we also deactivate earlier link openers.
        # (no links in links)
        if not is_image:
            opener = self.delimiters
            while opener is not None:
                if opener.get('cc') == '[':
                    # deactivate this opener
                    opener['active'] = False
                opener = opener.get('previous')

        return True
    else:
        # no match
        # remove this opener from stack
        self.removeDelimiter(opener)
        self.pos = startpos
        block.append_child(text(']'))
        return True
def test_node_walker(self):
    """Smoke test: a NodeWalker can be built for a 'document' node."""
    root = Node('document', [[1, 1], [0, 0]])
    NodeWalker(root)
def parseCloseBracket(self, block):
    """Try to match a close bracket against the last opener on the bracket
    stack.

    Appends either a 'link' or 'image' node, or a plain ``]`` character, to
    ``block``'s children. The opener examined is removed from the bracket
    stack unless the stack is empty. Always returns True.

    An inline destination ``(...)`` is tried first; if that does not match,
    a reference label is looked up in ``self.refmap``.
    """
    title = None
    matched = False
    # Must be initialised: neither branch below assigns it when the label is
    # empty/missing and the opener is known to be followed by a bracket.
    reflabel = None
    # Step over the ']' itself.
    self.pos += 1
    startpos = self.pos

    # get last [ or ![
    opener = self.brackets

    if opener is None:
        # no matched opener, just return a literal
        block.append_child(text(']'))
        return True

    if not opener.get('active'):
        # no matched opener, just return a literal
        block.append_child(text(']'))
        # take opener off brackets stack
        self.removeBracket()
        return True

    # If we got here, opener is a potential opener
    is_image = opener.get('image')

    # Check to see if we have a link/image

    savepos = self.pos

    # Inline link?
    if self.peek() == '(':
        self.pos += 1
        self.spnl()
        dest = self.parseLinkDestination()
        if dest is not None and self.spnl():
            # make sure there's a space before the title
            if re.match(reWhitespaceChar, self.subject[self.pos-1]):
                title = self.parseLinkTitle()
            if self.spnl() and self.peek() == ')':
                self.pos += 1
                matched = True
            # NOTE(review): when a destination parses but the title or the
            # closing ')' does not, the position is not rewound before the
            # reference lookup below -- confirm this is intended.
        else:
            self.pos = savepos

    if not matched:
        # Next, see if there's a link label
        beforelabel = self.pos
        n = self.parseLinkLabel()
        if n > 2:
            reflabel = self.subject[beforelabel:beforelabel + n]
        elif not opener.get('bracket_after'):
            # Empty or missing second label means to use the first
            # label as the reference.  The reference must not
            # contain a bracket. If we know there's a bracket, we
            # don't even bother checking it.
            reflabel = self.subject[opener.get('index'):startpos]
        if n == 0:
            # If shortcut reference link, rewind before spaces we skipped.
            self.pos = savepos

        if reflabel:
            # lookup rawlabel in refmap
            link = self.refmap.get(normalizeReference(reflabel))
            if link:
                dest = link['destination']
                title = link['title']
                matched = True

    if matched:
        node = Node('image' if is_image else 'link', None)
        node.destination = dest
        node.title = title or ''
        # Everything after the opener's node becomes the link content.
        tmp = opener.get('node').nxt
        while tmp:
            nxt = tmp.nxt
            tmp.unlink()
            node.append_child(tmp)
            tmp = nxt
        block.append_child(node)
        self.processEmphasis(opener.get('previousDelimiter'))
        self.removeBracket()
        opener.get('node').unlink()

        # We remove this bracket and processEmphasis will remove
        # later delimiters.
        # Now, for a link, we also deactivate earlier link openers.
        # (no links in links)
        if not is_image:
            opener = self.brackets
            while opener is not None:
                if not opener.get('image'):
                    # deactivate this opener
                    opener['active'] = False
                opener = opener.get('previous')

        return True
    else:
        # no match
        # remove this opener from stack
        self.removeBracket()
        self.pos = startpos
        block.append_child(text(']'))
        return True
def text(s):
    """Wrap the string ``s`` in a new 'Text' node and return it."""
    literal_node = Node("Text", None)
    literal_node.literal = s
    return literal_node
def parseCloseBracket(self, block):
    """Try to match a close bracket against an opener in the delimiter stack.

    Appends either a 'Link' or 'Image' node, or a plain ``]`` character, to
    ``block``'s children. If there is a matching opener it is removed from
    the delimiter stack. Always returns True.

    An inline destination ``(...)`` is tried when the next character is
    ``(``; otherwise a reference label is looked up in ``self.refmap``.
    """
    title = None
    matched = False
    # Step over the ']' itself.
    self.pos += 1
    startpos = self.pos

    # look through the stack of delimiters for a [ or ![
    opener = self.delimiters
    while opener is not None:
        if opener.get('cc') == '[' or opener.get('cc') == '!':
            break
        opener = opener.get('previous')

    if opener is None:
        # no matched opener, just return a literal
        block.append_child(text(']'))
        return True

    if not opener.get('active'):
        # no matched opener, just return a literal
        block.append_child(text(']'))
        # take opener off emphasis stack
        self.removeDelimiter(opener)
        return True

    # If we got here, opener is a potential opener
    is_image = opener.get('cc') == '!'

    # Check to see if we have a link/image

    # Inline link?
    if self.peek() == '(':
        self.pos += 1
        self.spnl()
        dest = self.parseLinkDestination()
        if dest is not None and \
           self.spnl():
            # make sure there's a space before the title
            if re.match(reWhitespaceChar, self.subject[self.pos - 1]):
                title = self.parseLinkTitle()
            if self.spnl() and self.peek() == ')':
                self.pos += 1
                matched = True
    else:
        # Next, see if there's a link label
        savepos = self.pos
        beforelabel = self.pos
        n = self.parseLinkLabel()
        if n == 0 or n == 2:
            # empty or missing second label
            reflabel = self.subject[opener['index']:startpos]
        else:
            reflabel = self.subject[beforelabel:beforelabel + n]
        if n == 0:
            # If shortcut reference link, rewind before spaces we skipped.
            self.pos = savepos

        # lookup rawlabel in refmap
        link = self.refmap.get(normalizeReference(reflabel))
        if link:
            dest = link['destination']
            title = link['title']
            matched = True

    if matched:
        node = Node('Image' if is_image else 'Link', None)
        node.destination = dest
        node.title = title or ''
        # Everything after the opener's node becomes the link content.
        tmp = opener.get('node').nxt
        while tmp:
            nxt = tmp.nxt
            tmp.unlink()
            node.append_child(tmp)
            tmp = nxt
        block.append_child(node)
        self.processEmphasis(opener.get('previous'))

        opener.get('node').unlink()

        # processEmphasis will remove this and later delimiters.
        # Now, for a link, we also deactivate earlier link openers.
        # (no links in links)
        if not is_image:
            opener = self.delimiters
            while opener is not None:
                if opener.get('cc') == '[':
                    # deactivate this opener
                    opener['active'] = False
                opener = opener.get('previous')

        return True
    else:
        # no match
        # remove this opener from stack
        self.removeDelimiter(opener)
        self.pos = startpos
        block.append_child(text(']'))
        return True
def processEmphasis(self, stack_bottom):
    """Resolve emphasis and smart-quote delimiters above ``stack_bottom``.

    Walks the delimiter stack (linked through 'previous'/'next') from the
    first entry above ``stack_bottom`` towards the top. For each ``*``/``_``
    closer with a matching opener, the inlines between them are moved into a
    new 'Emph' or 'Strong' node. For ``'`` and ``"`` closers the delimiter
    text is replaced with curly quotes. Finally every delimiter above
    ``stack_bottom`` is removed from the stack.

    Args:
        stack_bottom: delimiter entry below which nothing is touched, or
            ``None`` to process the whole stack.
    """
    # Per-character lower bound for opener searches; raised whenever a
    # closer is found to have no opener.
    openers_bottom = {
        '_': stack_bottom,
        '*': stack_bottom,
        "'": stack_bottom,
        '"': stack_bottom,
    }
    use_delims = 0

    # Find first closer above stack_bottom
    closer = self.delimiters
    while closer is not None and closer.get('previous') != stack_bottom:
        closer = closer.get('previous')

    # Move forward, looking for closers, and handling each
    while closer is not None:
        closercc = closer.get('cc')
        if not (closer.get('can_close') and
                (closercc == '_' or
                 closercc == '*' or
                 closercc == "'" or
                 closercc == '"')):
            closer = closer.get('next')
        else:
            # found emphasis closer. now look back for first
            # matching opener:
            opener = closer.get('previous')
            opener_found = False
            while (opener is not None and opener != stack_bottom and
                   opener != openers_bottom[closercc]):
                if opener.get('cc') == closercc and opener.get('can_open'):
                    opener_found = True
                    break
                opener = opener.get('previous')
            # ``closer`` is advanced below; keep the entry being handled.
            old_closer = closer

            if closercc == '*' or closercc == '_':
                if not opener_found:
                    closer = closer.get('next')
                else:
                    # Calculate actual number of delimiters used from
                    # closer
                    if closer['numdelims'] < 3 or opener['numdelims'] < 3:
                        if closer['numdelims'] <= opener['numdelims']:
                            use_delims = closer['numdelims']
                        else:
                            use_delims = opener['numdelims']
                    else:
                        if closer['numdelims'] % 2 == 0:
                            use_delims = 2
                        else:
                            use_delims = 1

                    opener_inl = opener.get('node')
                    closer_inl = closer.get('node')

                    # Remove used delimiters from stack elts and inlines
                    opener['numdelims'] -= use_delims
                    closer['numdelims'] -= use_delims
                    opener_inl.literal = opener_inl.literal[:len(
                        opener_inl.literal) - use_delims]
                    closer_inl.literal = closer_inl.literal[:len(
                        closer_inl.literal) - use_delims]

                    # Build contents for new Emph element
                    if use_delims == 1:
                        emph = Node('Emph', None)
                    else:
                        emph = Node('Strong', None)

                    # Move every inline between the two delimiter nodes
                    # into the new element.
                    tmp = opener_inl.nxt
                    while tmp and tmp != closer_inl:
                        nxt = tmp.nxt
                        tmp.unlink()
                        emph.append_child(tmp)
                        tmp = nxt

                    opener_inl.insert_after(emph)

                    # Remove elts between opener and closer in delimiters
                    # stack
                    self.removeDelimitersBetween(opener, closer)

                    # If opener has 0 delims, remove it and the inline
                    if opener['numdelims'] == 0:
                        opener_inl.unlink()
                        self.removeDelimiter(opener)

                    # A closer with delimiters left is handled again on the
                    # next pass of the loop.
                    if closer['numdelims'] == 0:
                        closer_inl.unlink()
                        tempstack = closer['next']
                        self.removeDelimiter(closer)
                        closer = tempstack

            elif closercc == "'":
                # Right single quote; the opener, if any, becomes a left one.
                closer['node'].literal = '\u2019'
                if opener_found:
                    opener['node'].literal = '\u2018'
                closer = closer['next']

            elif closercc == '"':
                # Right double quote; the opener, if any, becomes a left one.
                closer['node'].literal = '\u201D'
                if opener_found:
                    opener['node'].literal = '\u201C'
                closer = closer['next']

            if not opener_found:
                # Set lower bound for future searches for openers:
                openers_bottom[closercc] = old_closer['previous']
                if not old_closer['can_open']:
                    # We can remove a closer that can't be an opener,
                    # once we've seen there's no matching opener:
                    self.removeDelimiter(old_closer)

    # Remove all delimiters
    while self.delimiters is not None and self.delimiters != stack_bottom:
        self.removeDelimiter(self.delimiters)
def test_node_walker_iter(self):
    """Smoke test: iterating a 'document' node's walker must not raise."""
    root = Node('document', [[1, 1], [0, 0]])
    walker = root.walker()
    for subnode, entered in walker:
        continue
def nestSections(block, level=1):
    """Group the children of ``block`` into nested 'MDsection' nodes.

    Every 'heading' child whose level equals ``level`` starts a new
    'MDsection'; the nodes that follow it, up to the next such heading, are
    moved into that section. Children that appear before the first such
    heading stay directly under ``block``. Each section is then processed
    recursively with ``level + 1``.

    If ``block`` has no children, or none of them is a heading of exactly
    ``level``, nothing is changed.

    Args:
        block: node whose child list is rewritten in place.
        level: heading level that opens a section at this depth.

    Returns:
        None.
    """
    if block.first_child is None:
        return

    def starts_section(node):
        # A heading of exactly the requested level opens a new section.
        return node.t == 'heading' and node.level == level

    # Do we need to do anything?
    cur = block.first_child
    while cur is not None and not starts_section(cur):
        cur = cur.nxt
    if cur is None:
        return

    children = []
    section = Node('MDsection', 0)
    section.parent = block
    cur = block.first_child
    while cur is not None:
        if starts_section(cur) and section.first_child is not None:
            # Found a split point, flush the section built so far.
            finalizeSection(section)
            children.append(section)
            section = Node('MDsection', 0)
        # Remember the successor before ``cur`` is moved elsewhere.
        nxt = cur.nxt
        if section.first_child is not None or starts_section(cur):
            section.append_child(cur)
        else:
            # Avoid adding sections without titles at the start
            children.append(cur)
        cur = nxt

    # Flush the trailing section, if it collected anything.
    if section.first_child is not None:
        finalizeSection(section)
        children.append(section)

    # Rebuild the sibling chain of ``block`` from the collected children.
    block.first_child = None
    block.last_child = None
    nextLevel = level + 1
    for child in children:
        # Handle nesting
        if child.t == 'MDsection':
            nestSections(child, level=nextLevel)
        # Append
        if block.first_child is None:
            block.first_child = child
        else:
            block.last_child.nxt = child
        child.parent = block
        child.nxt = None
        # The backward link is ``prv`` (paired with ``nxt``); the original
        # wrote only ``prev``, leaving backward links stale.
        child.prv = block.last_child
        # Misspelled alias kept for any caller that read it.
        child.prev = block.last_child
        block.last_child = child