class Parser(object):
    """Line-oriented block parser that builds a tree of ``Node`` objects.

    ``parse`` splits the input into lines and feeds each one to
    ``incorporate_line``, which first matches the line against the chain of
    still-open blocks, then tries to open new blocks, and finally adds the
    remaining text to the appropriate block.  When all lines are consumed
    every open block is finalized and inline content is parsed.

    Parser state (all reset per line unless noted):

    * ``doc`` / ``tip`` / ``oldtip`` -- document root, deepest open block,
      and the tip as it was when the current line started.
    * ``current_line``, ``line_number``, ``last_line_length`` -- the line
      being processed, its 1-based number, and the previous line's length.
    * ``offset`` / ``column`` -- current character index and the
      corresponding column (tabs advance the column to the next multiple
      of 4).
    * ``next_nonspace`` / ``next_nonspace_column`` / ``indent`` /
      ``indented`` / ``blank`` -- results of ``find_next_nonspace``.
    * ``partially_consumed_tab`` -- True when ``offset`` points at a tab
      of which only some columns have been consumed.
    * ``all_closed`` / ``last_matched_container`` -- bookkeeping used by
      ``close_unmatched_blocks``.
    * ``refmap`` -- reference map handed to the inline parser.
    """

    def __init__(self, options=None):
        """Create a parser.

        ``options`` is stored on the instance and passed to
        ``InlineParser``.  When omitted, a fresh empty dict is used.
        """
        # A literal ``{}`` default would be one dict shared by every Parser
        # (and every InlineParser) created without arguments.
        if options is None:
            options = {}
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.block_starts = BlockStarts()
        self.tip = self.doc
        self.oldtip = self.doc
        self.current_line = ''
        self.line_number = 0
        self.offset = 0
        self.column = 0
        self.next_nonspace = 0
        self.next_nonspace_column = 0
        self.indent = 0
        self.indented = False
        self.blank = False
        self.partially_consumed_tab = False
        self.all_closed = True
        self.last_matched_container = self.doc
        self.refmap = {}
        self.last_line_length = 0
        self.inline_parser = InlineParser(options)
        self.options = options

    def _block_class(self, block_type):
        """Return the handler class in ``CommonMark.blocks`` for a node type.

        The class name is the camel-cased form of ``block_type``.
        """
        return getattr(
            import_module('CommonMark.blocks'), to_camel_case(block_type))

    def add_line(self):
        """ Add a line to the block at the tip.  We assume the tip
        can accept lines -- that check should be done before calling this."""
        if self.partially_consumed_tab:
            # Skip over tab
            self.offset += 1
            # Add space characters standing in for the unconsumed part
            # of the tab.
            chars_to_tab = 4 - (self.column % 4)
            self.tip.string_content += (' ' * chars_to_tab)
        self.tip.string_content += (self.current_line[self.offset:] + '\n')

    def add_child(self, tag, offset):
        """ Add block of type tag as a child of the tip.  If the tip can't
        accept children, close and finalize it and try its parent,
        and so on til we find a block that can accept children.

        ``offset`` is the 0-based character index of the block start; the
        new node's start column is ``offset + 1``.  Returns the new node,
        which also becomes the tip."""
        # finalize() moves self.tip to the parent, so the handler must be
        # looked up again on every iteration.
        while not self._block_class(self.tip.t).can_contain(tag):
            self.finalize(self.tip, self.line_number - 1)

        column_number = offset + 1
        new_block = Node(tag, [[self.line_number, column_number], [0, 0]])
        new_block.string_content = ''
        self.tip.append_child(new_block)
        self.tip = new_block
        return new_block

    def close_unmatched_blocks(self):
        """Finalize and close any unmatched blocks.

        Walks from ``oldtip`` up to ``last_matched_container`` and
        finalizes each block on the way; does nothing when ``all_closed``
        is already set."""
        if not self.all_closed:
            while self.oldtip != self.last_matched_container:
                parent = self.oldtip.parent
                self.finalize(self.oldtip, self.line_number - 1)
                self.oldtip = parent
        self.all_closed = True

    def find_next_nonspace(self):
        """Locate the next non-space, non-tab character from ``offset``.

        Updates ``blank``, ``next_nonspace``, ``next_nonspace_column``,
        ``indent`` and ``indented`` without moving ``offset``/``column``."""
        current_line = self.current_line
        i = self.offset
        cols = self.column

        # A one-character slice is '' past the end of the line.
        c = current_line[i:i + 1]
        while c != '':
            if c == ' ':
                i += 1
                cols += 1
            elif c == '\t':
                i += 1
                # A tab advances to the next multiple of 4 columns.
                cols += (4 - (cols % 4))
            else:
                break
            c = current_line[i:i + 1]

        self.blank = (c == '\n' or c == '\r' or c == '')
        self.next_nonspace = i
        self.next_nonspace_column = cols
        self.indent = self.next_nonspace_column - self.column
        self.indented = self.indent >= CODE_INDENT

    def advance_next_nonspace(self):
        """Move ``offset``/``column`` to the position found by
        ``find_next_nonspace``."""
        self.offset = self.next_nonspace
        self.column = self.next_nonspace_column
        self.partially_consumed_tab = False

    def advance_offset(self, count, columns):
        """Advance the position in the current line.

        When ``columns`` is true, ``count`` is a number of columns and a
        tab may be consumed only partially (recorded in
        ``partially_consumed_tab``).  Otherwise ``count`` is a number of
        characters.  Stops early at the end of the line."""
        current_line = self.current_line
        while count > 0 and self.offset < len(current_line):
            if current_line[self.offset] == '\t':
                chars_to_tab = 4 - (self.column % 4)
                if columns:
                    self.partially_consumed_tab = chars_to_tab > count
                    chars_to_advance = min(count, chars_to_tab)
                    self.column += chars_to_advance
                    # Stay on the tab while part of it is still unconsumed.
                    self.offset += 0 if self.partially_consumed_tab else 1
                    count -= chars_to_advance
                else:
                    self.partially_consumed_tab = False
                    self.column += chars_to_tab
                    self.offset += 1
                    count -= 1
            else:
                self.partially_consumed_tab = False
                self.offset += 1
                # assume ascii; block starts are ascii
                self.column += 1
                count -= 1

    def incorporate_line(self, ln):
        """Analyze a line of text and update the document appropriately.

        We parse markdown text by calling this on each line of input,
        then finalizing the document.
        """
        all_matched = True

        container = self.doc
        self.oldtip = self.tip
        self.offset = 0
        self.column = 0
        self.blank = False
        self.partially_consumed_tab = False
        self.line_number += 1

        # replace NUL characters for security
        if re.search(r'\u0000', ln) is not None:
            ln = re.sub(r'\0', '\uFFFD', ln)

        self.current_line = ln

        # For each containing block, try to parse the associated line start.
        # Bail out on failure: container will point to the last matching
        # block.  Set all_matched to false if not all containers match.
        last_child = container.last_child
        while last_child and last_child.is_open:
            container = last_child

            self.find_next_nonspace()
            rv = self._block_class(container.t).continue_(self, container)
            if rv == 0:
                # we've matched, keep going
                pass
            elif rv == 1:
                # we've failed to match a block
                all_matched = False
            elif rv == 2:
                # we've hit end of line for fenced code close and can return
                self.last_line_length = len(ln)
                return
            else:
                raise ValueError('returned illegal value, must be 0, 1, or 2')

            if not all_matched:
                # back up to last matching block
                container = container.parent
                break

            last_child = container.last_child

        self.all_closed = (container == self.oldtip)
        self.last_matched_container = container

        block_class = self._block_class(container.t)
        matched_leaf = container.t != 'paragraph' and block_class.accepts_lines
        starts = self.block_starts
        starts_len = len(starts.METHODS)
        # Unless last matched container is a code block, try new container
        # starts, adding children to the last matched container:
        while not matched_leaf:
            self.find_next_nonspace()

            # this is a little performance optimization:
            if not self.indented and \
               not re.search(reMaybeSpecial, ln[self.next_nonspace:]):
                self.advance_next_nonspace()
                break

            # Try each block start in order.  1 means a container was
            # opened (keep looking for nested starts); 2 means a leaf was
            # opened (stop); anything else means "no match, try the next".
            i = 0
            while i < starts_len:
                res = getattr(starts, starts.METHODS[i])(self, container)
                if res == 1:
                    container = self.tip
                    break
                elif res == 2:
                    container = self.tip
                    matched_leaf = True
                    break
                else:
                    i += 1

            if i == starts_len:
                # nothing matched
                self.advance_next_nonspace()
                break

        # What remains at the offset is a text line. Add the text to the
        # appropriate container.
        if not self.all_closed and not self.blank and \
           self.tip.t == 'paragraph':
            # lazy paragraph continuation
            self.add_line()
        else:
            # not a lazy continuation
            # finalize any blocks not matched
            self.close_unmatched_blocks()
            if self.blank and container.last_child:
                container.last_child.last_line_blank = True

            t = container.t

            # Block quote lines are never blank as they start with >
            # and we don't count blanks in fenced code for purposes of
            # tight/loose lists or breaking out of lists.  We also
            # don't set last_line_blank on an empty list item, or if we
            # just closed a fenced block.
            last_line_blank = self.blank and \
                not (t == 'block_quote' or
                     (t == 'code_block' and container.is_fenced) or
                     (t == 'item' and
                      not container.first_child and
                      container.sourcepos[0][0] == self.line_number))

            # propagate last_line_blank up through parents:
            cont = container
            while cont:
                cont.last_line_blank = last_line_blank
                cont = cont.parent

            if self._block_class(t).accepts_lines:
                self.add_line()
                # if HtmlBlock, check for end condition
                if t == 'html_block' and \
                   container.html_block_type >= 1 and \
                   container.html_block_type <= 5 and \
                   re.search(
                       reHtmlBlockClose[container.html_block_type],
                       self.current_line[self.offset:]):
                    self.finalize(container, self.line_number)
            elif self.offset < len(ln) and not self.blank:
                # create a paragraph container for one line
                container = self.add_child('paragraph', self.offset)
                self.advance_next_nonspace()
                self.add_line()

        self.last_line_length = len(ln)

    def finalize(self, block, line_number):
        """ Finalize a block.  Close it and do any necessary postprocessing,
        e.g. creating string_content from strings, setting the 'tight'
        or 'loose' status of a list, and parsing the beginnings
        of paragraphs for reference definitions.  Reset the tip to the
        parent of the closed block."""
        # Remember the parent first: the handler's finalize may alter the
        # block.
        above = block.parent
        block.is_open = False
        block.sourcepos[1] = [line_number, self.last_line_length]
        self._block_class(block.t).finalize(self, block)

        self.tip = above

    def process_inlines(self, block):
        """
        Walk through a block & children recursively, parsing string content
        into inline content where appropriate.
        """
        walker = block.walker()
        self.inline_parser.refmap = self.refmap
        self.inline_parser.options = self.options
        event = walker.nxt()
        while event is not None:
            node = event['node']
            t = node.t
            # Parse on the way out of a node (the "leaving" event).
            if not event['entering'] and (t == 'paragraph' or t == 'heading'):
                self.inline_parser.parse(node)
            event = walker.nxt()

    def parse(self, my_input):
        """ The main parsing function.  Returns a parsed document AST."""
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.tip = self.doc
        self.refmap = {}
        self.line_number = 0
        self.last_line_length = 0
        self.offset = 0
        self.column = 0
        self.last_matched_container = self.doc
        self.current_line = ''
        lines = re.split(reLineEnding, my_input)
        length = len(lines)
        if len(my_input) > 0 and my_input[-1] == '\n':
            # ignore last blank line created by final newline
            length -= 1
        for line in lines[:length]:
            self.incorporate_line(line)
        while (self.tip):
            self.finalize(self.tip, length)
        self.process_inlines(self.doc)
        return self.doc
class Parser:
    """Line-oriented block parser (legacy camelCase revision).

    ``parse`` splits the input into lines, feeds each to
    ``incorporateLine`` and then finalizes every open block and parses
    inline content.  Besides the standard block types this revision
    recognises two brace-delimited constructs: ``{: ...}`` attribute lists
    (handled by ``parseIAL``) and ``{:: ...}`` / ``{:/ ...}`` extension
    blocks (node type ``ExtensionBlock``).
    """

    def __init__(self, subject=None, pos=0):
        """Create a parser with an empty document, reference map and a
        fresh ``InlineParser``.  ``subject`` and ``pos`` are only stored."""
        self.doc = Node.makeNode("Document", 1, 1)
        self.subject = subject
        self.pos = pos
        self.tip = self.doc
        self.refmap = {}
        self.inlineParser = InlineParser()

    def acceptsLines(self, block_type):
        """ Returns true if block type can accept lines of text."""
        return block_type == 'Paragraph' or \
            block_type == 'IndentedCode' or \
            block_type == 'FencedCode' or \
            block_type == 'HtmlBlock'

    def endsWithBlankLine(self, block):
        """ Returns true if block ends with a blank line,
        descending if needed into lists and sublists."""
        if block.last_line_blank:
            return True
        if (block.t == "List" or block.t == "Item") and \
           len(block.children) > 0:
            return self.endsWithBlankLine(block.children[-1])
        else:
            return False

    def breakOutOfLists(self, block, line_number):
        """ Break out of all containing lists, resetting the tip of the
        document to the parent of the highest list, and finalizing
        all the lists.  (This is used to implement the "two blank lines
        break out of all lists" feature.)"""
        # Find the outermost enclosing list (the last one seen while
        # walking towards the root).
        b = block
        last_list = None
        while True:
            if (b.t == "List"):
                last_list = b
            b = b.parent
            if not b:
                break

        if (last_list):
            while block != last_list:
                self.finalize(block, line_number)
                block = block.parent
            self.finalize(last_list, line_number)
            self.tip = last_list.parent

    def addLine(self, ln, offset):
        """ Add a line to the block at the tip.  We assume the tip
        can accept lines -- that check should be done before calling this.

        Raises ``Exception`` if the tip is already closed."""
        s = ln[offset:]
        if not self.tip.is_open:
            raise Exception(
                "Attempted to add line (" + ln + ") to closed container.")
        self.tip.strings.append(s)

    def addChild(self, tag, line_number, offset):
        """ Add block of type tag as a child of the tip.  If the tip can't
        accept children, close and finalize it and try its parent,
        and so on til we find a block that can accept children.

        Returns the new node, which also becomes the tip."""
        while not (self.tip.t == "Document" or
                   self.tip.t == "BlockQuote" or
                   self.tip.t == "Item" or
                   (self.tip.t == "List" and tag == "Item")):
            self.finalize(self.tip, line_number - 1)
        column_number = offset + 1
        newNode = Node.makeNode(tag, line_number, column_number)
        self.tip.children.append(newNode)
        newNode.parent = self.tip
        self.tip = newNode
        return newNode

    def listsMatch(self, list_data, item_data):
        """ Returns true if the two list items are of the same type,
        with the same delimiter and bullet character.  This is used
        in agglomerating list items into lists."""
        return (list_data.get("type", None) ==
                item_data.get("type", None) and
                list_data.get("delimiter", None) ==
                item_data.get("delimiter", None) and
                list_data.get("bullet_char", None) ==
                item_data.get("bullet_char", None))

    def parseListMarker(self, ln, offset):
        """ Parse a list marker and return data on the marker (type,
        start, delimiter, bullet character, padding) or None."""
        rest = ln[offset:]
        data = {}
        blank_item = bool()
        # A horizontal rule takes precedence over a bullet marker.
        if re.match(reHrule, rest):
            return None
        match = re.search(reBulletListMarker, rest)
        match2 = re.search(reOrderedListMarker, rest)
        if match:
            spaces_after_marker = len(match.group(1))
            data['type'] = 'Bullet'
            data['bullet_char'] = match.group(0)[0]
            # The item is blank when the marker is the whole rest of the
            # line.  (Previously the matched *string* was compared with an
            # int, which is always False.)
            blank_item = len(match.group(0)) == len(rest)
        elif match2:
            spaces_after_marker = len(match2.group(3))
            data['type'] = 'Ordered'
            data['start'] = int(match2.group(1))
            data['delimiter'] = match2.group(2)
            blank_item = len(match2.group(0)) == len(rest)
        else:
            return None
        if spaces_after_marker >= 5 or spaces_after_marker < 1 or blank_item:
            # Only one space after the marker counts as padding here.
            if match:
                data['padding'] = len(match.group(0)) - spaces_after_marker + 1
            elif match2:
                data['padding'] = len(
                    match2.group(0)) - spaces_after_marker + 1
        else:
            if match:
                data['padding'] = len(match.group(0))
            elif match2:
                data['padding'] = len(match2.group(0))
        return data

    def parseIAL(self, ln):
        """Parse the inside of a ``{: ...}`` attribute list into a dict.

        Collects ``.name`` tokens under ``"class"`` (space-joined), the
        first ``#`` token under ``"id"``, and every ``word`` or
        ``word=word`` token as a key with its value (``''`` when there is
        no ``=``)."""
        values = []
        css_class = re.findall(r"\.(\w+) *", ln)
        if css_class:
            values.append(("class", " ".join(css_class)))
        # NOTE(review): the "." after "#" consumes one character, so the
        # captured id lacks its first letter ("#foo" -> "oo") -- confirm
        # whether that is intended before changing the pattern.
        css_id = re.findall(r"\#.(\w+) *", ln)
        if css_id:
            values.append(("id", css_id[0]))
        # NOTE(review): this pattern also matches the bare class and id
        # words, which therefore show up as extra keys with '' values.
        keyed_values = re.findall(r"(\w+)(?:=(\w+))? *", ln)
        if keyed_values:
            values += keyed_values
        return dict(values)

    def incorporateLine(self, ln, line_number):
        """ Analyze a line of text and update the document appropriately.
        We parse markdown text by calling this on each line of
        input, then finalizing the document.

        The work happens in three phases: match the line against the open
        blocks, try to open new blocks, then add the remaining text to the
        resulting container."""
        all_matched = True
        offset = 0
        CODE_INDENT = 4
        blank = None
        already_done = False

        container = self.doc
        oldtip = self.tip

        ln = detabLine(ln)

        # Phase 1: descend through the open blocks, consuming each one's
        # line prefix.  Stop at the first block the line does not continue.
        while len(container.children) > 0:
            last_child = container.children[-1]
            if not last_child.is_open:
                break
            container = last_child

            match = matchAt(r"[^ ]", ln, offset)
            if match is None:
                first_nonspace = len(ln)
                blank = True
            else:
                first_nonspace = match
                blank = False
            indent = first_nonspace - offset
            if container.t == "BlockQuote":
                matched = bool()
                if len(ln) > first_nonspace and len(ln) > 0:
                    matched = ln[first_nonspace] == ">"
                matched = indent <= 3 and matched
                if matched:
                    offset = first_nonspace + 1
                    # One optional space after ">" belongs to the marker.
                    try:
                        if ln[offset] == " ":
                            offset += 1
                    except IndexError:
                        pass
                else:
                    all_matched = False
            elif container.t == "Item":
                if (indent >= container.list_data['marker_offset'] +
                        container.list_data['padding']):
                    offset += container.list_data[
                        'marker_offset'] + container.list_data['padding']
                elif blank:
                    offset = first_nonspace
                else:
                    all_matched = False
            elif container.t == "IndentedCode":
                if indent >= CODE_INDENT:
                    offset += CODE_INDENT
                elif blank:
                    offset = first_nonspace
                else:
                    all_matched = False
            elif container.t in ["ATXHeader", "SetextHeader",
                                 "HorizontalRule"]:
                # Single-line blocks never continue.
                all_matched = False
            elif container.t == "FencedCode":
                # Skip up to fence_offset leading spaces.
                i = container.fence_offset
                while i > 0 and len(ln) > offset and ln[offset] == " ":
                    offset += 1
                    i -= 1
            elif container.t == "HtmlBlock":
                if blank:
                    all_matched = False
            elif container.t == "Paragraph":
                if blank:
                    container.last_line_blank = True
                    all_matched = False
            if not all_matched:
                container = container.parent
                break
        last_matched_container = container

        def closeUnmatchedBlocks(self, already_done, oldtip):
            """ This function is used to finalize and close any unmatched
            blocks.  We aren't ready to do this now, because we might
            have a lazy paragraph continuation, in which case we don't
            want to close unmatched blocks.  So we store this closure for
            use later, when we have more information."""
            while not already_done and not oldtip == last_matched_container:
                self.finalize(oldtip, line_number)
                oldtip = oldtip.parent
            return True, oldtip

        if blank and container.last_line_blank:
            self.breakOutOfLists(container, line_number)

        # Phase 2: unless we are inside a literal block, try to open new
        # blocks at the current position.
        while container.t != "ExtensionBlock" and \
                container.t != "FencedCode" and \
                container.t != "IndentedCode" and \
                container.t != "HtmlBlock" and \
                matchAt(r"^[ #`~*+_=<>0-9-{]", ln, offset) is not None:
            match = matchAt("[^ ]", ln, offset)
            if match is None:
                first_nonspace = len(ln)
                blank = True
            else:
                first_nonspace = match
                blank = False
            ATXmatch = re.search(reATXHeaderMarker, ln[first_nonspace:])
            FENmatch = re.search(reCodeFence, ln[first_nonspace:])
            PARmatch = re.search(reSetextHeaderLine, ln[first_nonspace:])
            IALmatch = re.search(r"^{:((\}|[^}])*)} *$", ln[first_nonspace:])
            EXTmatch = re.search(
                r"^{::((\\\}|[^\\}])*)/?} *$", ln[first_nonspace:])
            data = self.parseListMarker(ln, first_nonspace)

            indent = first_nonspace - offset
            if data:
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                data['marker_offset'] = indent
                offset = first_nonspace + data['padding']
                # Start a new list unless this item continues the
                # current one.
                if not container.t == "List" or not self.listsMatch(
                        container.list_data, data):
                    container = self.addChild(
                        "List", line_number, first_nonspace)
                    container.list_data = data
                container = self.addChild(
                    "Item", line_number, first_nonspace)
                container.list_data = data
            elif indent >= CODE_INDENT:
                if not self.tip.t == "Paragraph" and not blank:
                    offset += CODE_INDENT
                    already_done, oldtip = closeUnmatchedBlocks(
                        self, already_done, oldtip)
                    container = self.addChild(
                        'IndentedCode', line_number, offset)
                else:
                    break
            elif len(ln) > first_nonspace and ln[first_nonspace] == ">":
                offset = first_nonspace + 1
                try:
                    if ln[offset] == " ":
                        offset += 1
                except IndexError:
                    pass
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild("BlockQuote", line_number, offset)
            elif EXTmatch:
                args = EXTmatch.group(1)
                keyed_values = re.findall(r"(\w+)(?:=(\w+))? *", args)
                offset = first_nonspace + len(EXTmatch.group(0))
                # NOTE(review): this and the prints below look like
                # leftover debug output; kept so stdout is unchanged.
                print("EXT {} {}".format(args, offset))
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild(
                    "ExtensionBlock", line_number, first_nonspace)
                # First token is the extension name, the rest attributes.
                container.title = keyed_values.pop(0)[0]
                container.attributes = dict(keyed_values)
                print(EXTmatch.group(0))
                print(args)
                # "{::name/}" is self-closing.
                if (EXTmatch.group(0)[-2] == '/'):
                    self.finalize(container, line_number)
                break
            elif IALmatch:
                offset = first_nonspace + len(IALmatch.group(0))
                print("Found {}".format(IALmatch.group(0)))
                print("blank {}".format(blank))
                print("container {} {}".format(
                    self.tip.t, container.last_line_blank))
                if blank:
                    # FIXME
                    # attributes.update(self.parseIAL(IALmatch.group(1)))
                    pass
                else:
                    self.tip.attributes = self.parseIAL(IALmatch.group(1))
                break
            elif ATXmatch:
                offset = first_nonspace + len(ATXmatch.group(0))
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild(
                    "ATXHeader", line_number, first_nonspace)
                container.level = len(ATXmatch.group(0).strip())
                # Strip the closing "#" run, keeping an escaped "\#".
                if not re.search(r'\\#', ln[offset:]) is None:
                    container.strings = [
                        re.sub(r'(?:(\\#) *#*| *#+) *$', r'\g<1>',
                               ln[offset:])]
                else:
                    container.strings = [
                        re.sub(r'(?:(\\#) *#*| *#+) *$', '', ln[offset:])]
                break
            elif FENmatch:
                fence_length = len(FENmatch.group(0))
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild(
                    "FencedCode", line_number, first_nonspace)
                container.fence_length = fence_length
                container.fence_char = FENmatch.group(0)[0]
                container.fence_offset = first_nonspace - offset
                offset = first_nonspace + fence_length
                break
            elif not matchAt(reHtmlBlockOpen, ln, first_nonspace) is None:
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild(
                    'HtmlBlock', line_number, first_nonspace)
                break
            elif container.t == "Paragraph" and \
                    len(container.strings) == 1 and PARmatch:
                # A one-line paragraph followed by an underline becomes a
                # setext header.
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container.t = "SetextHeader"
                container.level = 1 if PARmatch.group(0)[0] == '=' else 2
                offset = len(ln)
            elif not matchAt(reHrule, ln, first_nonspace) is None:
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild(
                    "HorizontalRule", line_number, first_nonspace)
                offset = len(ln) - 1
                break
            else:
                break

            if self.acceptsLines(container.t):
                break

        # Phase 3: what remains at the offset is a text line.
        match = matchAt(r"[^ ]", ln, offset)
        if match is None:
            first_nonspace = len(ln)
            blank = True
        else:
            first_nonspace = match
            blank = False
        indent = first_nonspace - offset

        if not self.tip == last_matched_container and \
                not blank and self.tip.t == "Paragraph" and \
                len(self.tip.strings) > 0:
            # Lazy paragraph continuation.
            # NOTE(review): this sets an attribute on the parser itself,
            # not on a block -- confirm the intended target.
            self.last_line_blank = False
            self.addLine(ln, offset)
        else:
            already_done, oldtip = closeUnmatchedBlocks(
                self, already_done, oldtip)
            container.last_line_blank = \
                blank and \
                not (container.t == "BlockQuote" or
                     container.t == "FencedCode" or
                     (container.t == "Item" and
                      len(container.children) == 0 and
                      container.start_line == line_number))
            cont = container
            while cont.parent:
                cont.parent.last_line_blank = False
                cont = cont.parent
            if container.t == "IndentedCode" or container.t == "HtmlBlock":
                self.addLine(ln, offset)
            # This must be ``elif``: as a separate ``if`` the chain below
            # fell through to its final ``else`` for IndentedCode and
            # added every code line a second time.
            elif container.t == "ExtensionBlock":
                EXTmatch = re.search(
                    r"^{:/((\\\}|[^\\}])*)} *$", ln[first_nonspace:])
                if EXTmatch:
                    self.finalize(container, line_number)
                else:
                    self.addLine(ln, offset)
            elif container.t == "FencedCode":
                # A closing fence uses the opening character, is indented
                # at most 3 and is at least as long as the opening fence.
                match = bool()
                if len(ln) > 0:
                    match = len(ln) > first_nonspace and \
                        ln[first_nonspace] == container.fence_char and \
                        re.match(
                            r"^(?:`{3,}|~{3,})(?= *$)", ln[first_nonspace:])
                match = indent <= 3 and match
                FENmatch = re.search(
                    r"^(?:`{3,}|~{3,})(?= *$)", ln[first_nonspace:])
                if match and len(FENmatch.group(0)) >= container.fence_length:
                    self.finalize(container, line_number)
                else:
                    self.addLine(ln, offset)
            elif container.t in ["ATXHeader", "SetextHeader", "HtmlBlock"]:
                # nothing to do; we already added the contents.
                pass
            else:
                if self.acceptsLines(container.t):
                    self.addLine(ln, first_nonspace)
                elif blank:
                    pass
                elif container.t != "HorizontalRule" and \
                        container.t != "SetextHeader":
                    container = self.addChild(
                        "Paragraph", line_number, first_nonspace)
                    self.addLine(ln, first_nonspace)
                else:
                    # print("Line " + str(line_number) +
                    #       " with container type " +
                    #       container.t + " did not match any condition.")
                    pass

    def finalize(self, block, line_number):
        """ Finalize a block.  Close it and do any necessary postprocessing,
        e.g. creating string_content from strings, setting the 'tight'
        or 'loose' status of a list, and parsing the beginnings
        of paragraphs for reference definitions.  Reset the tip to the
        parent of the closed block.

        Returns 0 without doing anything if the block is already closed."""
        if (not block.is_open):
            return 0

        block.is_open = False
        if (line_number > block.start_line):
            block.end_line = line_number - 1
        else:
            block.end_line = line_number

        if (block.t == "Paragraph"):
            block.string_content = ""
            for i, line in enumerate(block.strings):
                # ``flags`` must be passed by keyword: the fourth
                # positional argument of re.sub is ``count``.
                block.strings[i] = re.sub(
                    r'^ *', '', line, flags=re.MULTILINE)
            block.string_content = '\n'.join(block.strings)

            # Strip leading reference definitions; a paragraph made only
            # of them becomes a ReferenceDef node.
            pos = self.inlineParser.parseReference(
                block.string_content, self.refmap)
            while (block.string_content.startswith("[") and pos):
                block.string_content = block.string_content[pos:]
                if (isBlank(block.string_content)):
                    block.t = "ReferenceDef"
                    break
                pos = self.inlineParser.parseReference(
                    block.string_content, self.refmap)
        elif (block.t in ["ATXHeader", "SetextHeader", "HtmlBlock"]):
            block.string_content = "\n".join(block.strings)
        elif (block.t == "IndentedCode"):
            # Collapse trailing blank lines into one newline.  count=1
            # stops Python 3.7+ from also replacing the empty match that
            # follows, which would leave two newlines.
            block.string_content = re.sub(
                r"(\n *)*$", "\n", "\n".join(block.strings), count=1)
        elif (block.t == "FencedCode"):
            # The first stored line is the info string after the fence.
            block.info = unescape(block.strings[0].strip())
            if (len(block.strings) == 1):
                block.string_content = ""
            else:
                block.string_content = "\n".join(block.strings[1:]) + "\n"
        elif (block.t == "List"):
            # A list is loose if a blank line separates two items, or two
            # children of any item.
            block.tight = True

            numitems = len(block.children)
            i = 0
            while (i < numitems):
                item = block.children[i]
                last_item = (i == numitems-1)
                if (self.endsWithBlankLine(item) and not last_item):
                    block.tight = False
                    break
                numsubitems = len(item.children)
                j = 0
                while (j < numsubitems):
                    subitem = item.children[j]
                    last_subitem = j == (numsubitems - 1)
                    if (self.endsWithBlankLine(subitem) and
                            not (last_item and last_subitem)):
                        block.tight = False
                        break
                    j += 1
                i += 1
        else:
            pass

        self.tip = block.parent

    def processInlines(self, block):
        """ Walk through a block & children recursively, parsing string
        content into inline content where appropriate."""
        if block.t in ["ATXHeader", "Paragraph", "SetextHeader"]:
            block.inline_content = self.inlineParser.parse(
                block.string_content.strip(), self.refmap)
            block.string_content = ""

        if block.children:
            for i in block.children:
                self.processInlines(i)

    def parse(self, my_input):
        """ The main parsing function.  Returns a parsed document AST."""
        self.doc = Node.makeNode("Document", 1, 1)
        self.tip = self.doc
        self.refmap = {}
        # Drop one trailing newline so it does not yield an extra line.
        lines = re.split(reLineEnding, re.sub(r'\n$', '', my_input))
        length = len(lines)
        for i in range(length):
            self.incorporateLine(lines[i], i + 1)
        while (self.tip):
            self.finalize(self.tip, length)
        self.processInlines(self.doc)
        return self.doc
class Parser(object):
    """Line-oriented block parser that builds a tree of ``Node`` objects.

    ``parse`` splits the input into lines and feeds each one to
    ``incorporate_line``, which first matches the line against the chain of
    still-open blocks, then tries to open new blocks, and finally adds the
    remaining text to the appropriate block.  When all lines are consumed
    every open block is finalized and inline content is parsed.

    Parser state (all reset per line unless noted):

    * ``doc`` / ``tip`` / ``oldtip`` -- document root, deepest open block,
      and the tip as it was when the current line started.
    * ``current_line``, ``line_number``, ``last_line_length`` -- the line
      being processed, its 1-based number, and the previous line's length.
    * ``offset`` / ``column`` -- current character index and the
      corresponding column (tabs advance the column to the next multiple
      of 4).
    * ``next_nonspace`` / ``next_nonspace_column`` / ``indent`` /
      ``indented`` / ``blank`` -- results of ``find_next_nonspace``.
    * ``partially_consumed_tab`` -- True when ``offset`` points at a tab
      of which only some columns have been consumed.
    * ``all_closed`` / ``last_matched_container`` -- bookkeeping used by
      ``close_unmatched_blocks``.
    * ``refmap`` -- reference map handed to the inline parser.
    """

    def __init__(self, options=None):
        """Create a parser.

        ``options`` is stored on the instance and passed to
        ``InlineParser``.  When omitted, a fresh empty dict is used.
        """
        # A literal ``{}`` default would be one dict shared by every Parser
        # (and every InlineParser) created without arguments.
        if options is None:
            options = {}
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.block_starts = BlockStarts()
        self.tip = self.doc
        self.oldtip = self.doc
        self.current_line = ''
        self.line_number = 0
        self.offset = 0
        self.column = 0
        self.next_nonspace = 0
        self.next_nonspace_column = 0
        self.indent = 0
        self.indented = False
        self.blank = False
        self.partially_consumed_tab = False
        self.all_closed = True
        self.last_matched_container = self.doc
        self.refmap = {}
        self.last_line_length = 0
        self.inline_parser = InlineParser(options)
        self.options = options

    def _block_class(self, block_type):
        """Return the handler class in ``CommonMark.blocks`` for a node type.

        The class name is the camel-cased form of ``block_type``.
        """
        return getattr(
            import_module('CommonMark.blocks'), to_camel_case(block_type))

    def break_out_of_lists(self, block):
        """
        Break out of all containing lists, resetting the tip of the
        document to the parent of the highest list, and finalizing
        all the lists.  (This is used to implement the "two blank lines
        break out of all lists" feature.)
        """
        # Find the outermost enclosing list (the last one seen while
        # walking towards the root).
        b = block
        last_list = None
        while True:
            if (b.t == 'list'):
                last_list = b
            b = b.parent
            if not b:
                break

        if (last_list):
            while block != last_list:
                self.finalize(block, self.line_number)
                block = block.parent
            self.finalize(last_list, self.line_number)
            self.tip = last_list.parent

    def add_line(self):
        """ Add a line to the block at the tip.  We assume the tip
        can accept lines -- that check should be done before calling this."""
        if self.partially_consumed_tab:
            # Skip over tab
            self.offset += 1
            # Add space characters standing in for the unconsumed part
            # of the tab.
            chars_to_tab = 4 - (self.column % 4)
            self.tip.string_content += (' ' * chars_to_tab)
        self.tip.string_content += (self.current_line[self.offset:] + '\n')

    def add_child(self, tag, offset):
        """ Add block of type tag as a child of the tip.  If the tip can't
        accept children, close and finalize it and try its parent,
        and so on til we find a block that can accept children.

        ``offset`` is the 0-based character index of the block start; the
        new node's start column is ``offset + 1``.  Returns the new node,
        which also becomes the tip."""
        # finalize() moves self.tip to the parent, so the handler must be
        # looked up again on every iteration.
        while not self._block_class(self.tip.t).can_contain(tag):
            self.finalize(self.tip, self.line_number - 1)

        column_number = offset + 1
        new_block = Node(tag, [[self.line_number, column_number], [0, 0]])
        new_block.string_content = ''
        self.tip.append_child(new_block)
        self.tip = new_block
        return new_block

    def close_unmatched_blocks(self):
        """Finalize and close any unmatched blocks.

        Walks from ``oldtip`` up to ``last_matched_container`` and
        finalizes each block on the way; does nothing when ``all_closed``
        is already set."""
        if not self.all_closed:
            while self.oldtip != self.last_matched_container:
                parent = self.oldtip.parent
                self.finalize(self.oldtip, self.line_number - 1)
                self.oldtip = parent
        self.all_closed = True

    def find_next_nonspace(self):
        """Locate the next non-space, non-tab character from ``offset``.

        Updates ``blank``, ``next_nonspace``, ``next_nonspace_column``,
        ``indent`` and ``indented`` without moving ``offset``/``column``."""
        current_line = self.current_line
        i = self.offset
        cols = self.column

        # A one-character slice is '' past the end of the line.
        c = current_line[i:i + 1]
        while c != '':
            if c == ' ':
                i += 1
                cols += 1
            elif c == '\t':
                i += 1
                # A tab advances to the next multiple of 4 columns.
                cols += (4 - (cols % 4))
            else:
                break
            c = current_line[i:i + 1]

        self.blank = (c == '\n' or c == '\r' or c == '')
        self.next_nonspace = i
        self.next_nonspace_column = cols
        self.indent = self.next_nonspace_column - self.column
        self.indented = self.indent >= CODE_INDENT

    def advance_next_nonspace(self):
        """Move ``offset``/``column`` to the position found by
        ``find_next_nonspace``."""
        self.offset = self.next_nonspace
        self.column = self.next_nonspace_column
        self.partially_consumed_tab = False

    def advance_offset(self, count, columns):
        """Advance the position in the current line.

        When ``columns`` is true, ``count`` is a number of columns and a
        tab may be consumed only partially (recorded in
        ``partially_consumed_tab``).  Otherwise ``count`` is a number of
        characters.  Stops early at the end of the line."""
        current_line = self.current_line
        while count > 0 and self.offset < len(current_line):
            if current_line[self.offset] == '\t':
                chars_to_tab = 4 - (self.column % 4)
                if columns:
                    self.partially_consumed_tab = chars_to_tab > count
                    chars_to_advance = min(count, chars_to_tab)
                    self.column += chars_to_advance
                    # Stay on the tab while part of it is still unconsumed.
                    self.offset += 0 if self.partially_consumed_tab else 1
                    count -= chars_to_advance
                else:
                    self.partially_consumed_tab = False
                    self.column += chars_to_tab
                    self.offset += 1
                    # Decrement the local counter; this used to be
                    # ``self.count``, an attribute that does not exist.
                    count -= 1
            else:
                self.partially_consumed_tab = False
                self.offset += 1
                # assume ascii; block starts are ascii
                self.column += 1
                count -= 1

    def incorporate_line(self, ln):
        """Analyze a line of text and update the document appropriately.

        We parse markdown text by calling this on each line of input,
        then finalizing the document.
        """
        all_matched = True

        container = self.doc
        self.oldtip = self.tip
        self.offset = 0
        self.column = 0
        self.blank = False
        self.partially_consumed_tab = False
        self.line_number += 1

        # replace NUL characters for security
        if re.search(r'\u0000', ln) is not None:
            ln = re.sub(r'\0', '\uFFFD', ln)

        self.current_line = ln

        # For each containing block, try to parse the associated line start.
        # Bail out on failure: container will point to the last matching
        # block.  Set all_matched to false if not all containers match.
        last_child = container.last_child
        while last_child and last_child.is_open:
            container = last_child

            self.find_next_nonspace()
            rv = self._block_class(container.t).continue_(self, container)
            if rv == 0:
                # we've matched, keep going
                pass
            elif rv == 1:
                # we've failed to match a block
                all_matched = False
            elif rv == 2:
                # we've hit end of line for fenced code close and can return
                self.last_line_length = len(ln)
                return
            else:
                raise ValueError('returned illegal value, must be 0, 1, or 2')

            if not all_matched:
                # back up to last matching block
                container = container.parent
                break

            last_child = container.last_child

        self.all_closed = (container == self.oldtip)
        self.last_matched_container = container

        # Check to see if we've hit 2nd blank line; if so break out of list:
        if self.blank and container.last_line_blank:
            self.break_out_of_lists(container)
            container = self.tip

        block_class = self._block_class(container.t)
        matched_leaf = container.t != 'paragraph' and block_class.accepts_lines
        starts = self.block_starts
        starts_len = len(starts.METHODS)
        # Unless last matched container is a code block, try new container
        # starts, adding children to the last matched container:
        while not matched_leaf:
            self.find_next_nonspace()

            # this is a little performance optimization:
            if not self.indented and \
               not re.search(reMaybeSpecial, ln[self.next_nonspace:]):
                self.advance_next_nonspace()
                break

            # Try each block start in order.  1 means a container was
            # opened (keep looking for nested starts); 2 means a leaf was
            # opened (stop); anything else means "no match, try the next".
            i = 0
            while i < starts_len:
                res = getattr(starts, starts.METHODS[i])(self, container)
                if res == 1:
                    container = self.tip
                    break
                elif res == 2:
                    container = self.tip
                    matched_leaf = True
                    break
                else:
                    i += 1

            if i == starts_len:
                # nothing matched
                self.advance_next_nonspace()
                break

        # What remains at the offset is a text line. Add the text to the
        # appropriate container.
        if not self.all_closed and not self.blank and \
           self.tip.t == 'paragraph':
            # lazy paragraph continuation
            self.add_line()
        else:
            # not a lazy continuation
            # finalize any blocks not matched
            self.close_unmatched_blocks()
            if self.blank and container.last_child:
                container.last_child.last_line_blank = True

            t = container.t

            # Block quote lines are never blank as they start with >
            # and we don't count blanks in fenced code for purposes of
            # tight/loose lists or breaking out of lists.  We also
            # don't set last_line_blank on an empty list item, or if we
            # just closed a fenced block.
            last_line_blank = self.blank and \
                not (t == 'block_quote' or
                     (t == 'code_block' and container.is_fenced) or
                     (t == 'item' and
                      not container.first_child and
                      container.sourcepos[0][0] == self.line_number))

            # propagate last_line_blank up through parents:
            cont = container
            while cont:
                cont.last_line_blank = last_line_blank
                cont = cont.parent

            if self._block_class(t).accepts_lines:
                self.add_line()
                # if HtmlBlock, check for end condition
                if t == 'html_block' and \
                   container.html_block_type >= 1 and \
                   container.html_block_type <= 5 and \
                   re.search(
                       reHtmlBlockClose[container.html_block_type],
                       self.current_line[self.offset:]):
                    self.finalize(container, self.line_number)
            elif self.offset < len(ln) and not self.blank:
                # create a paragraph container for one line
                container = self.add_child('paragraph', self.offset)
                self.advance_next_nonspace()
                self.add_line()

        self.last_line_length = len(ln)

    def finalize(self, block, line_number):
        """ Finalize a block.  Close it and do any necessary postprocessing,
        e.g. creating string_content from strings, setting the 'tight'
        or 'loose' status of a list, and parsing the beginnings
        of paragraphs for reference definitions.  Reset the tip to the
        parent of the closed block."""
        # Remember the parent first: the handler's finalize may alter the
        # block.
        above = block.parent
        block.is_open = False
        block.sourcepos[1] = [line_number, self.last_line_length]
        self._block_class(block.t).finalize(self, block)

        self.tip = above

    def process_inlines(self, block):
        """
        Walk through a block & children recursively, parsing string content
        into inline content where appropriate.
        """
        walker = block.walker()
        self.inline_parser.refmap = self.refmap
        self.inline_parser.options = self.options
        event = walker.nxt()
        while event is not None:
            node = event['node']
            t = node.t
            # Parse on the way out of a node (the "leaving" event).
            if not event['entering'] and (t == 'paragraph' or t == 'heading'):
                self.inline_parser.parse(node)
            event = walker.nxt()

    def parse(self, my_input):
        """ The main parsing function.  Returns a parsed document AST."""
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.tip = self.doc
        self.refmap = {}
        self.line_number = 0
        self.last_line_length = 0
        self.offset = 0
        self.column = 0
        self.last_matched_container = self.doc
        self.current_line = ''
        lines = re.split(reLineEnding, my_input)
        length = len(lines)
        if len(my_input) > 0 and my_input[-1] == '\n':
            # ignore last blank line created by final newline
            length -= 1
        for line in lines[:length]:
            self.incorporate_line(line)
        while (self.tip):
            self.finalize(self.tip, length)
        self.process_inlines(self.doc)
        return self.doc
class Parser:
    """Line-oriented block parser that builds a tree of ``Node`` objects.

    The parser keeps a *tip*: the block that most recently received
    content. ``parse`` feeds the input to ``incorporateLine`` one line at
    a time, finalizes whatever is still open, and then runs the inline
    parser over paragraphs and headers.
    """

    def __init__(self, subject=None, pos=0):
        """Create a parser with an empty document as both root and tip."""
        self.doc = Node.makeNode("Document", 1, 1)
        self.subject = subject
        self.pos = pos
        self.tip = self.doc
        self.refmap = {}
        self.inlineParser = InlineParser()

    def acceptsLines(self, block_type):
        """ Returns true if block type can accept lines of text."""
        return block_type in (
            'Paragraph', 'IndentedCode', 'FencedCode', 'HtmlBlock')

    def endsWithBlankLine(self, block):
        """ Returns true if block ends with a blank line,
        descending if needed into lists and sublists."""
        if block.last_line_blank:
            return True
        if block.t in ("List", "Item") and len(block.children) > 0:
            return self.endsWithBlankLine(block.children[-1])
        return False

    def breakOutOfLists(self, block, line_number):
        """ Break out of all containing lists, resetting the tip of the
        document to the parent of the highest list, and finalizing
        all the lists.  (This is used to implement the "two blank lines
        break out of all lists" feature.)"""
        # Walk up to the root, remembering the outermost enclosing list.
        last_list = None
        b = block
        while b:
            if b.t == "List":
                last_list = b
            b = b.parent

        if last_list:
            # Close everything from `block` up to and including that list.
            while block != last_list:
                self.finalize(block, line_number)
                block = block.parent
            self.finalize(last_list, line_number)
            self.tip = last_list.parent

    def addLine(self, ln, offset):
        """ Add a line to the block at the tip.  We assume the tip
        can accept lines -- that check should be done before calling this.

        Raises Exception if the tip has already been closed."""
        s = ln[offset:]
        if not self.tip.is_open:
            raise Exception("Attempted to add line (" + ln +
                            ") to closed container.")
        self.tip.strings.append(s)

    def addChild(self, tag, line_number, offset):
        """ Add block of type tag as a child of the tip.  If the tip can't
        accept children, close and finalize it and try its parent,
        and so on til we find a block that can accept children.

        Returns the new node, which also becomes the tip."""
        while not (self.tip.t in ("Document", "BlockQuote", "Item") or
                   (self.tip.t == "List" and tag == "Item")):
            self.finalize(self.tip, line_number - 1)
        column_number = offset + 1
        newNode = Node.makeNode(tag, line_number, column_number)
        self.tip.children.append(newNode)
        newNode.parent = self.tip
        self.tip = newNode
        return newNode

    def listsMatch(self, list_data, item_data):
        """ Returns true if the two list items are of the same type,
        with the same delimiter and bullet character.  This is used
        in agglomerating list items into lists."""
        return all(
            list_data.get(key, None) == item_data.get(key, None)
            for key in ("type", "delimiter", "bullet_char"))

    def parseListMarker(self, ln, offset):
        """ Parse a list marker and return data on the marker (type,
        start, delimiter, bullet character, padding) or None."""
        rest = ln[offset:]
        data = {}
        # A horizontal rule such as "* * *" must not be read as a list.
        if re.match(reHrule, rest):
            return None
        match = re.search(reBulletListMarker, rest)
        if match:
            spaces_after_marker = len(match.group(1))
            data['type'] = 'Bullet'
            data['bullet_char'] = match.group(0)[0]
        else:
            match = re.search(reOrderedListMarker, rest)
            if not match:
                return None
            spaces_after_marker = len(match.group(3))
            data['type'] = 'Ordered'
            data['start'] = int(match.group(1))
            data['delimiter'] = match.group(2)

        marker_width = len(match.group(0))
        # The item is blank when the marker (plus its trailing spaces) is
        # the whole rest of the line.  This must compare lengths; comparing
        # the matched string itself with an int is always False.
        blank_item = marker_width == len(rest)
        if spaces_after_marker >= 5 or spaces_after_marker < 1 or blank_item:
            # Content starts exactly one column after the marker.
            data['padding'] = marker_width - spaces_after_marker + 1
        else:
            data['padding'] = marker_width
        return data

    def parseIAL(self, ln):
        """Parse the body of an inline attribute list into a dict.

        ``.name`` tokens are joined with spaces under the "class" key, the
        first ``#name`` token is stored under "id", and every ``key=value``
        (or bare ``key``) token is stored as ``key -> value`` with an empty
        string when no value is given.
        """
        values = []
        css_class = re.findall(r"\.(\w+) *", ln)
        if css_class:
            values.append(("class", " ".join(css_class)))
        # The id is everything after "#"; no extra character is skipped.
        css_id = re.findall(r"\#(\w+) *", ln)
        if css_id:
            values.append(("id", css_id[0]))
        # NOTE(review): this pattern also matches the names used by the
        # ".class" and "#id" tokens, so those appear as extra keys with an
        # empty value.  Kept as-is; confirm whether a renderer relies on it.
        keyed_values = re.findall(r"(\w+)(?:=(\w+))? *", ln)
        if keyed_values:
            values += keyed_values
        return dict(values)

    def incorporateLine(self, ln, line_number):
        """ Analyze a line of text and update the document appropriately.
        We parse markdown text by calling this on each line of input,
        then finalizing the document.

        The work happens in three phases: (1) descend through the open
        blocks and check that each one continues on this line, (2) unless
        the innermost matched block holds literal text, look for new block
        starts, (3) add what is left of the line to the proper block."""
        all_matched = True
        offset = 0
        CODE_INDENT = 4
        blank = None
        already_done = False

        container = self.doc
        oldtip = self.tip

        ln = detabLine(ln)

        # Phase 1: follow the chain of open last children, consuming the
        # continuation marker each container requires.
        while len(container.children) > 0:
            last_child = container.children[-1]
            if not last_child.is_open:
                break
            container = last_child

            match = matchAt(r"[^ ]", ln, offset)
            if match is None:
                first_nonspace = len(ln)
                blank = True
            else:
                first_nonspace = match
                blank = False
            indent = first_nonspace - offset

            if container.t == "BlockQuote":
                matched = False
                if len(ln) > first_nonspace and len(ln) > 0:
                    matched = ln[first_nonspace] == ">"
                matched = indent <= 3 and matched
                if matched:
                    offset = first_nonspace + 1
                    # One space after ">" belongs to the marker.
                    try:
                        if ln[offset] == " ":
                            offset += 1
                    except IndexError:
                        pass
                else:
                    all_matched = False
            elif container.t == "Item":
                item_indent = (container.list_data['marker_offset'] +
                               container.list_data['padding'])
                if indent >= item_indent:
                    offset += item_indent
                elif blank:
                    offset = first_nonspace
                else:
                    all_matched = False
            elif container.t == "IndentedCode":
                if indent >= CODE_INDENT:
                    offset += CODE_INDENT
                elif blank:
                    offset = first_nonspace
                else:
                    all_matched = False
            elif container.t in ["ATXHeader", "SetextHeader",
                                 "HorizontalRule"]:
                # Single-line blocks never continue.
                all_matched = False
            elif container.t == "FencedCode":
                # Skip up to as many spaces as the opening fence was
                # indented by.
                i = container.fence_offset
                while i > 0 and len(ln) > offset and ln[offset] == " ":
                    offset += 1
                    i -= 1
            elif container.t == "HtmlBlock":
                if blank:
                    all_matched = False
            elif container.t == "Paragraph":
                if blank:
                    container.last_line_blank = True
                    all_matched = False

            if not all_matched:
                # Back up to the last container that did match.
                container = container.parent
                break

        last_matched_container = container

        def closeUnmatchedBlocks(self, already_done, oldtip):
            """ This function is used to finalize and close any unmatched
            blocks.  We aren't ready to do this now, because we might
            have a lazy paragraph continuation, in which case we don't
            want to close unmatched blocks.  So we store this closure for
            use later, when we have more information."""
            while not already_done and not oldtip == last_matched_container:
                self.finalize(oldtip, line_number)
                oldtip = oldtip.parent
            return True, oldtip

        # A second consecutive blank line breaks out of all lists.
        if blank and container.last_line_blank:
            self.breakOutOfLists(container, line_number)

        # Phase 2: look for block starts, unless the container holds
        # literal text.  The character class is a cheap pre-filter for
        # lines that could possibly open a block.
        while container.t != "ExtensionBlock" and \
                container.t != "FencedCode" and \
                container.t != "IndentedCode" and \
                container.t != "HtmlBlock" and \
                matchAt(r"^[ #`~*+_=<>0-9-{]", ln, offset) is not None:
            match = matchAt("[^ ]", ln, offset)
            if match is None:
                first_nonspace = len(ln)
                blank = True
            else:
                first_nonspace = match
                blank = False

            ATXmatch = re.search(reATXHeaderMarker, ln[first_nonspace:])
            FENmatch = re.search(reCodeFence, ln[first_nonspace:])
            PARmatch = re.search(reSetextHeaderLine, ln[first_nonspace:])
            IALmatch = re.search(r"^{:((\}|[^}])*)} *$", ln[first_nonspace:])
            EXTmatch = re.search(
                r"^{::((\\\}|[^\\}])*)/?} *$", ln[first_nonspace:])
            data = self.parseListMarker(ln, first_nonspace)

            indent = first_nonspace - offset
            if data:
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                data['marker_offset'] = indent
                offset = first_nonspace + data['padding']
                # Start a new list unless this item extends the current one.
                if not container.t == "List" or not self.listsMatch(
                        container.list_data, data):
                    container = self.addChild(
                        "List", line_number, first_nonspace)
                    container.list_data = data
                container = self.addChild(
                    "Item", line_number, first_nonspace)
                container.list_data = data
            elif indent >= CODE_INDENT:
                # Indented code cannot interrupt a paragraph.
                if not self.tip.t == "Paragraph" and not blank:
                    offset += CODE_INDENT
                    already_done, oldtip = closeUnmatchedBlocks(
                        self, already_done, oldtip)
                    container = self.addChild(
                        'IndentedCode', line_number, offset)
                else:
                    break
            elif len(ln) > first_nonspace and ln[first_nonspace] == ">":
                offset = first_nonspace + 1
                try:
                    if ln[offset] == " ":
                        offset += 1
                except IndexError:
                    pass
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild("BlockQuote", line_number, offset)
            elif EXTmatch:
                # "{::name key=value}" opens an extension block; the first
                # word is its title, the remaining pairs its attributes.
                args = EXTmatch.group(1)
                keyed_values = re.findall(r"(\w+)(?:=(\w+))? *", args)
                offset = first_nonspace + len(EXTmatch.group(0))
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild(
                    "ExtensionBlock", line_number, first_nonspace)
                # A tag without any word (e.g. "{::}") gets an empty title
                # instead of raising IndexError.
                container.title = (
                    keyed_values.pop(0)[0] if keyed_values else "")
                container.attributes = dict(keyed_values)
                # "{::name/}" is self-closing.  Trailing spaces are
                # stripped first because the pattern allows them after "}".
                if EXTmatch.group(0).rstrip(" ").endswith("/}"):
                    self.finalize(container, line_number)
                break
            elif IALmatch:
                offset = first_nonspace + len(IALmatch.group(0))
                if blank:
                    # FIXME
                    # attributes.update(self.parseIAL(IALmatch.group(1)))
                    pass
                else:
                    self.tip.attributes = self.parseIAL(IALmatch.group(1))
                break
            elif ATXmatch:
                offset = first_nonspace + len(ATXmatch.group(0))
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild(
                    "ATXHeader", line_number, first_nonspace)
                container.level = len(ATXmatch.group(0).strip())
                # Drop the optional closing run of "#"; an escaped "\#"
                # is kept when one is present.
                if not re.search(r'\\#', ln[offset:]) is None:
                    container.strings = [
                        re.sub(r'(?:(\\#) *#*| *#+) *$', r'\g<1>',
                               ln[offset:])
                    ]
                else:
                    container.strings = [
                        re.sub(r'(?:(\\#) *#*| *#+) *$', '', ln[offset:])
                    ]
                break
            elif FENmatch:
                fence_length = len(FENmatch.group(0))
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild(
                    "FencedCode", line_number, first_nonspace)
                container.fence_length = fence_length
                container.fence_char = FENmatch.group(0)[0]
                container.fence_offset = first_nonspace - offset
                offset = first_nonspace + fence_length
                break
            elif not matchAt(reHtmlBlockOpen, ln, first_nonspace) is None:
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild(
                    'HtmlBlock', line_number, first_nonspace)
                break
            elif container.t == "Paragraph" and \
                    len(container.strings) == 1 and PARmatch:
                # A one-line paragraph followed by "===" or "---" turns
                # into a setext header.
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container.t = "SetextHeader"
                container.level = 1 if PARmatch.group(0)[0] == '=' else 2
                offset = len(ln)
            elif not matchAt(reHrule, ln, first_nonspace) is None:
                already_done, oldtip = closeUnmatchedBlocks(
                    self, already_done, oldtip)
                container = self.addChild(
                    "HorizontalRule", line_number, first_nonspace)
                offset = len(ln) - 1
                break
            else:
                break

            if self.acceptsLines(container.t):
                break

        # Phase 3: what remains of the line is text for the right block.
        match = matchAt(r"[^ ]", ln, offset)
        if match is None:
            first_nonspace = len(ln)
            blank = True
        else:
            first_nonspace = match
            blank = False
        indent = first_nonspace - offset

        if not self.tip == last_matched_container and \
                not blank and self.tip.t == "Paragraph" and \
                len(self.tip.strings) > 0:
            # Lazy paragraph continuation.
            self.last_line_blank = False
            self.addLine(ln, offset)
        else:
            # Not a lazy continuation: close the unmatched blocks first.
            already_done, oldtip = closeUnmatchedBlocks(
                self, already_done, oldtip)
            # Block quotes and fenced code are exempt from the blank-line
            # flag, as is an item that is still empty on its first line.
            container.last_line_blank = \
                blank and \
                not (container.t == "BlockQuote" or
                     container.t == "FencedCode" or
                     (container.t == "Item" and
                      len(container.children) == 0 and
                      container.start_line == line_number))
            cont = container
            while cont.parent:
                cont.parent.last_line_blank = False
                cont = cont.parent

            if container.t == "IndentedCode" or container.t == "HtmlBlock":
                self.addLine(ln, offset)
            # This must stay part of the same if/elif chain: as a separate
            # "if", IndentedCode fell through to the generic branch below
            # and every code line was added a second time.
            elif container.t == "ExtensionBlock":
                # "{:/...}" closes the extension block; any other line is
                # content.
                EXTmatch = re.search(
                    r"^{:/((\\\}|[^\\}])*)} *$", ln[first_nonspace:])
                if EXTmatch:
                    self.finalize(container, line_number)
                else:
                    self.addLine(ln, offset)
            elif container.t == "FencedCode":
                # A closing fence uses the opener's character, is indented
                # at most 3 columns, is at least as long as the opener and
                # is followed only by spaces.
                closing = re.match(
                    r"^(?:`{3,}|~{3,})(?= *$)", ln[first_nonspace:])
                if (closing and indent <= 3 and
                        ln[first_nonspace] == container.fence_char and
                        len(closing.group(0)) >= container.fence_length):
                    self.finalize(container, line_number)
                else:
                    self.addLine(ln, offset)
            elif container.t in ["ATXHeader", "SetextHeader", "HtmlBlock"]:
                # nothing to do; we already added the contents.
                pass
            else:
                if self.acceptsLines(container.t):
                    self.addLine(ln, first_nonspace)
                elif blank:
                    pass
                elif container.t != "HorizontalRule" and \
                        container.t != "SetextHeader":
                    container = self.addChild(
                        "Paragraph", line_number, first_nonspace)
                    self.addLine(ln, first_nonspace)
                else:
                    # No condition matched; the line is dropped.
                    pass

    def finalize(self, block, line_number):
        """ Finalize a block.  Close it and do any necessary postprocessing,
        e.g. creating string_content from strings, setting the 'tight'
        or 'loose' status of a list, and parsing the beginnings
        of paragraphs for reference definitions.  Reset the tip to the
        parent of the closed block.

        Returns 0 without doing anything when the block is already closed,
        otherwise None."""
        if not block.is_open:
            return 0
        block.is_open = False
        if line_number > block.start_line:
            block.end_line = line_number - 1
        else:
            block.end_line = line_number

        if block.t == "Paragraph":
            # Strip leading spaces of every line.  The flag is passed by
            # keyword: the fourth positional argument of re.sub is `count`.
            block.strings = [
                re.sub(r'^ *', '', line, flags=re.MULTILINE)
                for line in block.strings
            ]
            block.string_content = '\n'.join(block.strings)

            # Peel leading reference definitions off the paragraph.  The
            # [:1] slice keeps an empty string from raising IndexError.
            pos = self.inlineParser.parseReference(
                block.string_content, self.refmap)
            while block.string_content[:1] == "[" and pos:
                block.string_content = block.string_content[pos:]
                if isBlank(block.string_content):
                    block.t = "ReferenceDef"
                    break
                pos = self.inlineParser.parseReference(
                    block.string_content, self.refmap)
        elif block.t in ["ATXHeader", "SetextHeader", "HtmlBlock"]:
            block.string_content = "\n".join(block.strings)
        elif block.t == "IndentedCode":
            # Replace all trailing blank lines with exactly one newline.
            # The pattern needs at least one newline: one that can match
            # the empty string is replaced a second time at the end of the
            # text on Python 3.7+, leaving two newlines.
            block.string_content = re.sub(
                r"(?:\n *)+$", "", "\n".join(block.strings)) + "\n"
        elif block.t == "FencedCode":
            # The first stored string is the info string after the fence.
            block.info = unescape(block.strings[0].strip())
            if len(block.strings) == 1:
                block.string_content = ""
            else:
                block.string_content = "\n".join(block.strings[1:]) + "\n"
        elif block.t == "List":
            # A list is loose when a blank line separates two items, or
            # two direct children of one item.
            block.tight = True
            last_index = len(block.children) - 1
            for i, item in enumerate(block.children):
                last_item = i == last_index
                if self.endsWithBlankLine(item) and not last_item:
                    block.tight = False
                    break
                last_sub_index = len(item.children) - 1
                for j, subitem in enumerate(item.children):
                    last_subitem = j == last_sub_index
                    if (self.endsWithBlankLine(subitem) and
                            not (last_item and last_subitem)):
                        block.tight = False
                        break
                if not block.tight:
                    break

        self.tip = block.parent

    def processInlines(self, block):
        """ Walk through a block & children recursively, parsing string
        content into inline content where appropriate."""
        if block.t in ["ATXHeader", "Paragraph", "SetextHeader"]:
            block.inline_content = self.inlineParser.parse(
                block.string_content.strip(), self.refmap)
            block.string_content = ""

        if block.children:
            for child in block.children:
                self.processInlines(child)

    def parse(self, my_input):
        """ The main parsing function.  Returns a parsed document AST."""
        self.doc = Node.makeNode("Document", 1, 1)
        self.tip = self.doc
        self.refmap = {}
        # A single final newline would only add an empty trailing line.
        lines = re.split(reLineEnding, re.sub(r'\n$', '', my_input))
        length = len(lines)
        for number, line in enumerate(lines, 1):
            self.incorporateLine(line, number)
        # Each finalize() moves the tip up one level until nothing is open.
        while self.tip:
            self.finalize(self.tip, length)
        self.processInlines(self.doc)
        return self.doc