def parseHeaders(source):
    """Collect Markdown headers for building a Table Of Contents.

    Both hash headers ("# Title") and setext headers (a line underlined
    with "=" for level 1 or "-" for level 2) are recognised.

    return: [(level, text, position, anchor)]
        position/anchor is the header position in notesEdit/notesView
    """
    seen_anchors = set()  # lets unique() disambiguate same-named headers

    hash_re = re.compile(r'^(#+)(.+)', re.MULTILINE)
    setext_re = re.compile(r'(.+)\n([=-]+[ ]*)(\n|$)', re.MULTILINE)

    # Entries are (position, level, text): sorting them restores document
    # order across the two header styles.
    found = [(m.start(), len(m.group(1)), m.group(2))
             for m in hash_re.finditer(source)]
    found.extend(
        (m.start(), 1 if m.group(2).startswith('=') else 2, m.group(1))
        for m in setext_re.finditer(source))
    found.sort()

    return [(level, text, position,
             unique(slugify(text, '-'), seen_anchors))
            for (position, level, text) in found]
def parseHeaders(source):
    """Very simple header scanner used to construct the Table Of Contents.

    Only hash-style ("# Title") headers are detected.

    return: [(hdrLevel, hdrText, hdrPosition, hdrAnchor)]
        hdrLevel is the matched run of '#' characters itself (a string),
        hdrPosition is the offset of the match in ``source``.
    """
    seen_anchors = set()  # lets unique() disambiguate same-named headers
    hash_header = re.compile(r'^(#+)(.+)', re.MULTILINE)
    return [(m.group(1), m.group(2), m.start(),
             unique(slugify(m.group(2), '-'), seen_anchors))
            for m in hash_header.finditer(source)]
def testUniqueFunc(self):
    """ Check that 'unique' suffixes a clashing id and records the result. """
    from markdown.extensions.headerid import unique
    known = set(['foo'])
    result = unique('foo', known)
    self.assertEqual(result, 'foo_1')
    # The freshly generated id must have been added to the id set.
    self.assertEqual(known, set(['foo', 'foo_1']))
def run(self, doc):
    """Assign ids to headers and, when the marker is present, build the TOC.

    Walks ``doc`` via ``self.iterparent``.  Every element with text whose
    tag matches h1-h6 gets an ``id`` attribute (an existing one is kept), is
    recorded in a flat list and is handed to ``self.add_anchor``.  An element
    whose text equals ``self.config["marker"]`` is replaced in its parent by
    a ``<div class="toc">``.  Only if such a marker was found is the list
    nested, rendered into the div, serialized, post-processed and stored on
    ``self.markdown.toc``.
    """
    div = etree.Element("div")
    div.attrib["class"] = "toc"
    header_rgx = re.compile("[Hh][123456]")

    self.use_anchors = self.config["anchorlink"] in [1, '1', True, 'True', 'true']

    # Ids already present in the document must not be handed out again.
    used_ids = set()
    for c in doc.getiterator():
        if "id" in c.attrib:
            used_ids.add(c.attrib["id"])

    toc_list = []
    marker_found = False
    for (p, c) in self.iterparent(doc):
        text = ''.join(itertext(c)).strip()
        if not text:
            continue

        # To keep the output from breaking validation by putting a <div>
        # inside of a <p>, we replace the <p> in its entirety.
        # The marker is not allowed inside a header, as that would cause
        # an endless loop of placing a new TOC inside a previously
        # generated TOC.
        if c.text and c.text.strip() == self.config["marker"] and \
                not header_rgx.match(c.tag) and c.tag not in ['pre', 'code']:
            for i in range(len(p)):
                if p[i] == c:
                    p[i] = div
                    break
            marker_found = True

        if header_rgx.match(c.tag):
            # Do not override pre-existing ids
            if "id" not in c.attrib:
                elem_id = unique(self.config["slugify"](text, '-'), used_ids)
                c.attrib["id"] = elem_id
            else:
                elem_id = c.attrib["id"]

            tag_level = int(c.tag[-1])

            # Use the header's full text.  c.text only holds the text in
            # front of the first inline child, so a header that contains
            # inline markup would otherwise get a truncated or None name.
            toc_list.append({'level': tag_level,
                             'id': elem_id,
                             'name': text})

            self.add_anchor(c, elem_id)

    if marker_found:
        toc_list_nested = order_toc_list(toc_list)
        self.build_toc_etree(div, toc_list_nested)
        # serialize and attach to markdown instance.
        prettify = self.markdown.treeprocessors.get('prettify')
        if prettify:
            prettify.run(div)
        toc = self.markdown.serializer(div)
        for pp in self.markdown.postprocessors.values():
            toc = pp.run(toc)
        self.markdown.toc = toc
def run(self, doc):
    """Give every header an id and build a nested table of contents.

    Walks ``doc`` via ``self.iterparent``.  Each element with text whose tag
    matches h1-h6 gets an ``id`` attribute (an existing one is kept) and an
    ``<li><a href="#id">`` entry inside a ``<div class="toc">``; lists are
    ``<ol>`` or ``<ul>`` depending on ``self.config['ordered']``.  An element
    whose text equals ``self.config["marker"]`` is replaced by the div.  When
    no marker was found, the div is serialized, post-processed and stored on
    ``self.markdown.toc``.
    """
    marker_found = False

    div = etree.Element("div")
    div.attrib["class"] = "toc"
    last_li = None

    # Add title to the div
    if self.config["title"]:
        header = etree.SubElement(div, "span")
        header.attrib["class"] = "toctitle"
        header.text = self.config["title"]

    level = 0
    list_stack = [div]
    header_rgx = re.compile("[Hh][123456]")

    # Get a list of id attributes
    used_ids = []
    for c in doc.getiterator():
        if "id" in c.attrib:
            used_ids.append(c.attrib["id"])

    for (p, c) in self.iterparent(doc):
        text = ''.join(itertext(c)).strip()
        if not text:
            continue

        # To keep the output from breaking validation by putting a <div>
        # inside of a <p>, we replace the <p> in its entirety.
        # The marker is not allowed inside a header, as that would cause
        # an endless loop of placing a new TOC inside a previously
        # generated TOC.
        if c.text and c.text.strip() == self.config["marker"] and \
                not header_rgx.match(c.tag) and c.tag not in ['pre', 'code']:
            for i in range(len(p)):
                if p[i] == c:
                    p[i] = div
                    break
            marker_found = True

        if header_rgx.match(c.tag):
            try:
                tag_level = int(c.tag[-1])

                # Close lists until we are back at this header's depth.
                while tag_level < level:
                    list_stack.pop()
                    level -= 1

                if tag_level > level:
                    if self.config['ordered']:
                        newlist = etree.Element("ol")
                    else:
                        newlist = etree.Element("ul")
                    # Explicit None test: truth-testing an Element checks
                    # its child count.  last_li always has its <a> child,
                    # so the outcome is the same.
                    if last_li is not None:
                        last_li.append(newlist)
                    else:
                        list_stack[-1].append(newlist)
                    list_stack.append(newlist)
                    if level == 0:
                        level = tag_level
                    else:
                        level += 1

                # Do not override pre-existing ids
                if "id" not in c.attrib:
                    elem_id = unique(self.config["slugify"](text, '-'), used_ids)
                    c.attrib["id"] = elem_id
                else:
                    elem_id = c.attrib["id"]

                # List item link, to be inserted into the toc div
                last_li = etree.Element("li")
                link = etree.SubElement(last_li, "a")
                link.text = text
                link.attrib["href"] = '#' + elem_id

                if int(self.config["anchorlink"]):
                    anchor = etree.Element("a")
                    anchor.text = c.text
                    anchor.attrib["href"] = "#" + elem_id
                    anchor.attrib["class"] = "toclink"
                    c.text = ""
                    # Iterate over a snapshot: removing children while
                    # walking the live child list skips siblings.
                    for elem in list(c):
                        anchor.append(elem)
                        c.remove(elem)
                    c.append(anchor)

                list_stack[-1].append(last_li)
            except IndexError:
                # Headers climbed above the first header's level and the
                # list stack ran empty.  Skip the entry instead of
                # aborting the whole conversion.
                pass

    if not marker_found:
        # serialize and attach to markdown instance.
        prettify = self.markdown.treeprocessors.get('prettify')
        if prettify:
            prettify.run(div)
        toc = self.markdown.serializer(div)
        for pp in self.markdown.postprocessors.values():
            toc = pp.run(toc)
        self.markdown.toc = toc
def run(self, doc):
    """Give every header an id and build a nested table of contents.

    Walks ``doc`` via ``self.iterparent``.  Each element with text whose tag
    matches h1-h6 gets an ``id`` attribute (an existing one is kept) and an
    ``<li><a href="#id">`` entry inside a ``<div class="toc">``.  An element
    whose text equals ``self.config["marker"]`` is replaced by the div.  When
    no marker was found, the div is serialized, post-processed and stored on
    ``self.markdown.toc``.
    """
    marker_found = False

    div = etree.Element("div")
    div.attrib["class"] = "toc"
    last_li = None

    # Add title to the div
    if self.config["title"]:
        header = etree.SubElement(div, "span")
        header.attrib["class"] = "toctitle"
        header.text = self.config["title"]

    level = 0
    list_stack = [div]
    header_rgx = re.compile("[Hh][123456]")

    # Get a list of id attributes
    used_ids = []
    for c in doc.getiterator():
        if "id" in c.attrib:
            used_ids.append(c.attrib["id"])

    for (p, c) in self.iterparent(doc):
        text = ''.join(itertext(c)).strip()
        if not text:
            continue

        # To keep the output from breaking validation by putting a <div>
        # inside of a <p>, we replace the <p> in its entirety.
        # The marker is not allowed inside a header, as that would cause
        # an endless loop of placing a new TOC inside a previously
        # generated TOC.
        if c.text and c.text.strip() == self.config["marker"] and \
                not header_rgx.match(c.tag) and c.tag not in ['pre', 'code']:
            for i in range(len(p)):
                if p[i] == c:
                    p[i] = div
                    break
            marker_found = True

        if header_rgx.match(c.tag):
            try:
                tag_level = int(c.tag[-1])

                # Close lists until we are back at this header's depth.
                while tag_level < level:
                    list_stack.pop()
                    level -= 1

                if tag_level > level:
                    newlist = etree.Element("ul")
                    # Explicit None test: truth-testing an Element checks
                    # its child count.  last_li always has its <a> child,
                    # so the outcome is the same.
                    if last_li is not None:
                        last_li.append(newlist)
                    else:
                        list_stack[-1].append(newlist)
                    list_stack.append(newlist)
                    if level == 0:
                        level = tag_level
                    else:
                        level += 1

                # Do not override pre-existing ids
                if "id" not in c.attrib:
                    elem_id = unique(self.config["slugify"](text, '-'), used_ids)
                    c.attrib["id"] = elem_id
                else:
                    elem_id = c.attrib["id"]

                # List item link, to be inserted into the toc div
                last_li = etree.Element("li")
                link = etree.SubElement(last_li, "a")
                link.text = text
                link.attrib["href"] = '#' + elem_id

                if self.config["anchorlink"] in [1, '1', True, 'True', 'true']:
                    anchor = etree.Element("a")
                    anchor.text = c.text
                    anchor.attrib["href"] = "#" + elem_id
                    anchor.attrib["class"] = "toclink"
                    c.text = ""
                    # Iterate over a snapshot: removing children while
                    # walking the live child list skips siblings.
                    for elem in list(c):
                        anchor.append(elem)
                        c.remove(elem)
                    c.append(anchor)

                list_stack[-1].append(last_li)
            except IndexError:
                # We have bad ordering of headers. Just move on.
                pass

    if not marker_found:
        # serialize and attach to markdown instance.
        prettify = self.markdown.treeprocessors.get('prettify')
        if prettify:
            prettify.run(div)
        toc = self.markdown.serializer(div)
        for pp in self.markdown.postprocessors.values():
            toc = pp.run(toc)
        self.markdown.toc = toc
def run(self, doc):
    """Give every header an id and build a nested table of contents.

    Walks ``doc`` via ``self.iterparent``.  Each element with text whose tag
    matches h1-h6 gets an ``id`` attribute (an existing one is kept) and an
    ``<li><a href="#id">`` entry inside a ``<div class="toc">``.  An element
    whose text equals ``self.config['marker']`` is replaced by the div.  When
    no marker was found, the div is serialized, post-processed and stored on
    ``self.markdown.toc``.
    """
    marker_found = False
    div = etree.Element('div')
    div.attrib['class'] = 'toc'
    last_li = None

    # Optional title shown above the list.
    if self.config['title']:
        header = etree.SubElement(div, 'span')
        header.attrib['class'] = 'toctitle'
        header.text = self.config['title']

    level = 0
    list_stack = [div]
    header_rgx = re.compile('[Hh][123456]')

    # Ids already present in the document.
    used_ids = []
    for c in doc.getiterator():
        if 'id' in c.attrib:
            used_ids.append(c.attrib['id'])

    for p, c in self.iterparent(doc):
        text = ''.join(itertext(c)).strip()
        if not text:
            continue

        # Replace the element holding the marker with the TOC div.  The
        # marker is ignored inside headers and pre/code elements.
        if c.text and c.text.strip() == self.config['marker'] and \
                not header_rgx.match(c.tag) and c.tag not in ('pre', 'code'):
            for i in range(len(p)):
                if p[i] == c:
                    p[i] = div
                    break
            marker_found = True

        if header_rgx.match(c.tag):
            try:
                tag_level = int(c.tag[-1])

                # Close lists until we are back at this header's depth.
                while tag_level < level:
                    list_stack.pop()
                    level -= 1

                if tag_level > level:
                    newlist = etree.Element('ul')
                    # Explicit None test: truth-testing an Element checks
                    # its child count.  last_li always has its <a> child,
                    # so the outcome is the same.
                    if last_li is not None:
                        last_li.append(newlist)
                    else:
                        list_stack[-1].append(newlist)
                    list_stack.append(newlist)
                    if level == 0:
                        level = tag_level
                    else:
                        level += 1

                # Do not override pre-existing ids
                if 'id' not in c.attrib:
                    elem_id = unique(self.config['slugify'](text, '-'), used_ids)
                    c.attrib['id'] = elem_id
                else:
                    elem_id = c.attrib['id']

                # List item link, to be inserted into the toc div
                last_li = etree.Element('li')
                link = etree.SubElement(last_li, 'a')
                link.text = text
                link.attrib['href'] = '#' + elem_id

                if self.config['anchorlink'] in [1, '1', True, 'True', 'true']:
                    anchor = etree.Element('a')
                    anchor.text = c.text
                    anchor.attrib['href'] = '#' + elem_id
                    anchor.attrib['class'] = 'toclink'
                    c.text = ''
                    # Iterate over a snapshot taken through the public
                    # sequence protocol.  Walking the private _children
                    # list while removing from it skips every other child,
                    # and that attribute does not exist on the
                    # C-accelerated Element.
                    for elem in list(c):
                        anchor.append(elem)
                        c.remove(elem)
                    c.append(anchor)

                list_stack[-1].append(last_li)
            except IndexError:
                # Bad ordering of headers emptied the list stack; skip.
                pass

    if not marker_found:
        # Serialize and attach to the markdown instance.
        prettify = self.markdown.treeprocessors.get('prettify')
        if prettify:
            prettify.run(div)
        toc = self.markdown.serializer(div)
        for pp in self.markdown.postprocessors.values():
            toc = pp.run(toc)
        self.markdown.toc = toc
def run(self, doc):
    """Give every header an id and build a nested table of contents.

    Walks ``doc`` via ``self.iterparent``.  Each element with text whose tag
    matches h1-h6 gets an ``id`` attribute (an existing one is kept) and an
    ``<li><a href="#id">`` entry inside a ``<div class="toc">``.  An element
    whose text equals ``self.config['marker']`` is replaced by the div.  When
    no marker was found, the div is serialized, post-processed and stored on
    ``self.markdown.toc``.
    """
    marker_found = False
    div = etree.Element('div')
    div.attrib['class'] = 'toc'
    last_li = None

    # Optional title shown above the list.
    if self.config['title']:
        header = etree.SubElement(div, 'span')
        header.attrib['class'] = 'toctitle'
        header.text = self.config['title']

    level = 0
    list_stack = [div]
    header_rgx = re.compile('[Hh][123456]')

    # Ids already present in the document.
    used_ids = []
    for c in doc.getiterator():
        if 'id' in c.attrib:
            used_ids.append(c.attrib['id'])

    for p, c in self.iterparent(doc):
        text = ''.join(itertext(c)).strip()
        if not text:
            continue

        # Replace the element holding the marker with the TOC div.  The
        # marker is ignored inside headers and pre/code elements.
        if c.text and c.text.strip() == self.config['marker'] and \
                not header_rgx.match(c.tag) and c.tag not in ('pre', 'code'):
            for i in range(len(p)):
                if p[i] == c:
                    p[i] = div
                    break
            marker_found = True

        if header_rgx.match(c.tag):
            try:
                tag_level = int(c.tag[-1])

                # Close lists until we are back at this header's depth.
                while tag_level < level:
                    list_stack.pop()
                    level -= 1

                if tag_level > level:
                    newlist = etree.Element('ul')
                    # Explicit None test: truth-testing an Element checks
                    # its child count.  last_li always has its <a> child,
                    # so the outcome is the same.
                    if last_li is not None:
                        last_li.append(newlist)
                    else:
                        list_stack[-1].append(newlist)
                    list_stack.append(newlist)
                    if level == 0:
                        level = tag_level
                    else:
                        level += 1

                # Do not override pre-existing ids
                if 'id' not in c.attrib:
                    elem_id = unique(self.config['slugify'](text, '-'), used_ids)
                    c.attrib['id'] = elem_id
                else:
                    elem_id = c.attrib['id']

                # List item link, to be inserted into the toc div
                last_li = etree.Element('li')
                link = etree.SubElement(last_li, 'a')
                link.text = text
                link.attrib['href'] = '#' + elem_id

                if self.config['anchorlink'] in [1, '1', True, 'True', 'true']:
                    anchor = etree.Element('a')
                    anchor.text = c.text
                    anchor.attrib['href'] = '#' + elem_id
                    anchor.attrib['class'] = 'toclink'
                    c.text = ''
                    # Iterate over a snapshot taken through the public
                    # sequence protocol.  Walking the private _children
                    # list while removing from it skips every other child,
                    # and that attribute does not exist on the
                    # C-accelerated Element.
                    for elem in list(c):
                        anchor.append(elem)
                        c.remove(elem)
                    c.append(anchor)

                list_stack[-1].append(last_li)
            except IndexError:
                # Bad ordering of headers emptied the list stack; skip.
                pass

    if not marker_found:
        # Serialize and attach to the markdown instance.
        prettify = self.markdown.treeprocessors.get('prettify')
        if prettify:
            prettify.run(div)
        toc = self.markdown.serializer(div)
        for pp in self.markdown.postprocessors.values():
            toc = pp.run(toc)
        self.markdown.toc = toc
def run(self, doc):
    """Number the headers, add pilcrow links and build a nested TOC.

    Walks ``doc`` via ``self.iterparent``.  For each element with text whose
    tag matches h1-h6 a dotted section number is computed, a generated id is
    used for an ``<li><a href="#id">`` TOC entry, a stashed "¶" header link is
    appended to the header text and the header gets ``class="header"``.  An
    element whose text equals ``self.config["marker"]`` is replaced by the
    ``<div class="toc">``.  When no marker was found, the div is serialized,
    post-processed and stored on ``self.markdown.toc``.
    """
    marker_found = False

    div = etree.Element("div")
    div.attrib["class"] = "toc"
    last_li = None

    # Add title to the div
    if self.config["title"]:
        header = etree.SubElement(div, "span")
        header.attrib["class"] = "toctitle"
        header.text = self.config["title"]

    level = 0
    list_stack = [div]
    header_rgx = re.compile("[Hh][123456]")

    # Get a list of id attributes
    used_ids = []
    for c in doc.getiterator():
        if "id" in c.attrib:
            used_ids.append(c.attrib["id"])

    # Section numbering state: one counter per nesting depth.
    last_numbers = [0] * 5
    last_tag_level = 1
    last_level = -1

    for (p, c) in self.iterparent(doc):
        text = ''.join(itertext(c)).strip()
        if not text:
            continue

        # To keep the output from breaking validation by putting a <div>
        # inside of a <p>, we replace the <p> in its entirety.
        # The marker is not allowed inside a header, as that would cause
        # an endless loop of placing a new TOC inside a previously
        # generated TOC.
        if c.text and c.text.strip() == self.config["marker"] and \
                not header_rgx.match(c.tag) and c.tag not in ['pre', 'code']:
            for i in range(len(p)):
                if p[i] == c:
                    p[i] = div
                    break
            marker_found = True

        if header_rgx.match(c.tag):
            try:
                tag_level = int(c.tag[-1])

                level_diff = last_tag_level - tag_level
                if tag_level > last_tag_level:
                    this_level = last_level + 1
                    # Mimic the behaviour of the table of contents
                    if level_diff < 1:
                        tag_level = last_tag_level + 1
                elif tag_level < last_tag_level:
                    # If the tag difference is MORE than one, go up that many
                    if level_diff > 1:
                        this_level = last_level - level_diff
                    else:
                        this_level = last_level - 1
                    # Restart the counters of all deeper levels.
                    for i in xrange(this_level + 1, len(last_numbers)):
                        last_numbers[i] = 0
                else:
                    this_level = last_level

                last_numbers[this_level] += 1
                section_number = '.'.join(
                    map(unicode, last_numbers[:this_level + 1]))
                # NOTE(review): the placeholder is stashed but not used
                # again in this method; the call is kept so the stash
                # contents stay as before.
                placeholder = self.markdown.htmlStash.store(
                    u'<span>%s</span>' % section_number, safe=True)
                last_tag_level = tag_level
                last_level = this_level

                # Close lists until we are back at this header's depth.
                while tag_level < level:
                    list_stack.pop()
                    level -= 1

                if tag_level > level:
                    newlist = etree.Element("ul")
                    # Explicit None test: truth-testing an Element checks
                    # its child count.  last_li always has its <a> child,
                    # so the outcome is the same.
                    if last_li is not None:
                        last_li.append(newlist)
                    else:
                        list_stack[-1].append(newlist)
                    list_stack.append(newlist)
                    if level == 0:
                        level = tag_level
                    else:
                        level += 1

                elem_id = unique(self.config["slugify"](text, '-'), used_ids)

                # List item link, to be inserted into the toc div
                last_li = etree.Element("li")
                link = etree.SubElement(last_li, "a")
                link.text = c.text
                link.attrib["href"] = '#' + elem_id

                pilcrow_html = u'<a class="headerlink" name="%(id)s" href="#%(id)s">¶</a>' % {
                    'id': elem_id
                }
                header_link = self.markdown.htmlStash.store(pilcrow_html,
                                                            safe=True)
                # c.text is None when the header starts with an inline
                # child element; treat that as empty text instead of
                # raising TypeError on the concatenation.
                c.text = (c.text or u'') + header_link
                c.attrib['class'] = 'header'

                if self.config["anchorlink"] in [1, '1', True, 'True', 'true']:
                    anchor = etree.Element("a")
                    anchor.text = c.text
                    anchor.attrib["href"] = "#" + elem_id
                    anchor.attrib["class"] = "toclink"
                    c.text = ""
                    # Iterate over a snapshot: removing children while
                    # walking the live child list skips siblings.
                    for elem in list(c):
                        anchor.append(elem)
                        c.remove(elem)
                    c.append(anchor)

                list_stack[-1].append(last_li)
            except IndexError:
                # We have bad ordering of headers. Just move on.
                pass

    if not marker_found:
        # serialize and attach to markdown instance.
        prettify = self.markdown.treeprocessors.get('prettify')
        if prettify:
            prettify.run(div)
        toc = self.markdown.serializer(div)
        for pp in self.markdown.postprocessors.values():
            toc = pp.run(toc)
        self.markdown.toc = toc
def run(self, doc):
    """Number the headers, add pilcrow links and build a numbered TOC.

    Walks ``doc`` via ``self.iterparent``.  For each element with text whose
    tag matches h1-h6 a dotted section number is computed; the header is
    cleared and rebuilt as stashed number + text + a "¶" link with
    ``class="header"``, and a "<number> <text>" entry linking to a generated
    id is added to the TOC.  An element whose text equals
    ``self.config["marker"]`` is replaced by the ``<div class="toc">``.  When
    no marker was found, the div is serialized, post-processed and stored on
    ``self.markdown.toc``.
    """
    marker_found = False

    div = etree.Element("div")
    div.attrib["class"] = "toc"
    last_li = None

    # Add title to the div
    if self.config["title"]:
        header = etree.SubElement(div, "span")
        header.attrib["class"] = "toctitle"
        header.text = self.config["title"]

    level = 0
    list_stack = [div]
    header_rgx = re.compile("[Hh][123456]")

    # Get a list of id attributes
    used_ids = []
    for c in doc.getiterator():
        if "id" in c.attrib:
            used_ids.append(c.attrib["id"])

    # Section numbering state: one counter per nesting depth.
    last_numbers = [0] * 5
    last_tag_level = 0
    last_level = -1

    for (p, c) in self.iterparent(doc):
        text = ''.join(itertext(c)).strip()
        if not text:
            continue

        # To keep the output from breaking validation by putting a <div>
        # inside of a <p>, we replace the <p> in its entirety.
        # The marker is not allowed inside a header, as that would cause
        # an endless loop of placing a new TOC inside a previously
        # generated TOC.
        if c.text and c.text.strip() == self.config["marker"] and \
                not header_rgx.match(c.tag) and c.tag not in ['pre', 'code']:
            for i in range(len(p)):
                if p[i] == c:
                    p[i] = div
                    break
            marker_found = True

        if header_rgx.match(c.tag):
            try:
                tag_level = int(c.tag[-1])

                level_diff = last_tag_level - tag_level
                if tag_level > last_tag_level:
                    this_level = last_level + 1
                    # Mimic the behaviour of the table of contents
                    if level_diff < 1:
                        tag_level = last_tag_level + 1
                elif tag_level < last_tag_level:
                    # If the tag difference is MORE than one, go up that many
                    if level_diff > 1:
                        this_level = last_level - level_diff
                    else:
                        this_level = last_level - 1
                    # Restart the counters of all deeper levels.
                    for i in xrange(this_level + 1, len(last_numbers)):
                        last_numbers[i] = 0
                else:
                    this_level = last_level

                last_numbers[this_level] += 1
                section_number = '.'.join(
                    map(unicode, last_numbers[:this_level + 1]))
                placeholder = self.markdown.htmlStash.store(
                    u'<i>%s</i>' % section_number, safe=True)
                last_tag_level = tag_level
                last_level = this_level

                # Close lists until we are back at this header's depth.
                while tag_level < level:
                    list_stack.pop()
                    level -= 1

                if tag_level > level:
                    newlist = etree.Element("ul")
                    # Explicit None test: truth-testing an Element checks
                    # its child count.  last_li always has its <a> child,
                    # so the outcome is the same.
                    if last_li is not None:
                        last_li.append(newlist)
                    else:
                        list_stack[-1].append(newlist)
                    list_stack.append(newlist)
                    if level == 0:
                        level = tag_level
                    else:
                        level += 1

                elem_id = unique(self.config["slugify"](text, '-'), used_ids)

                # List item link, to be inserted into the toc div
                last_li = etree.Element("li")
                link = etree.SubElement(last_li, "a")
                link.text = '%s %s' % (section_number, text)
                link.attrib["href"] = '#' + elem_id

                # Needed in case the header contains subelements
                c.clear()
                c.text = placeholder + text

                pilcrow = etree.SubElement(c, "a")
                pilcrow.set('class', 'headerlink')
                pilcrow.set('name', elem_id)
                pilcrow.set('href', '#%s' % elem_id)
                pilcrow.text = self.markdown.htmlStash.store('¶', safe=True)
                c.attrib['class'] = 'header'

                if self.config["anchorlink"] in [1, '1', True, 'True', 'true']:
                    anchor = etree.Element("a")
                    anchor.text = c.text
                    anchor.attrib["href"] = "#" + elem_id
                    anchor.attrib["class"] = "toclink"
                    c.text = ""
                    # Iterate over a snapshot: removing children while
                    # walking the live child list skips siblings.
                    for elem in list(c):
                        anchor.append(elem)
                        c.remove(elem)
                    c.append(anchor)

                list_stack[-1].append(last_li)
            except IndexError:
                # We have bad ordering of headers. Just move on.
                pass

    if not marker_found:
        # serialize and attach to markdown instance.
        prettify = self.markdown.treeprocessors.get('prettify')
        if prettify:
            prettify.run(div)
        toc = self.markdown.serializer(div)
        for pp in self.markdown.postprocessors.values():
            toc = pp.run(toc)
        self.markdown.toc = toc
def parseHeaders(source, strip_fenced_block=False, strip_ascii_math=False):
    """Collect Markdown headers for building a Table Of Contents.

    When ``strip_fenced_block`` / ``strip_ascii_math`` are set, fenced code
    blocks / lines containing ``$$...$$`` are overwritten with newlines
    before scanning, so their content is not mistaken for headers while
    every offset stays the same.

    return: [(level, text, position, anchor)]
        position/anchor is the header position in notesEdit/notesView
    """
    seen_anchors = set()  # lets unique() disambiguate same-named headers

    # copied from asciimathml so we don't need a hard dependency to strip
    ASCIIMATHML_RE = re.compile(r'^(.*)\$\$([^\$]*)\$\$(.*)$', re.M)

    # copied from the fenced block code
    FENCED_BLOCK_RE = re.compile(r'''
(?P<fence>^(?:~{3,}|`{3,}))[ ]*         # Opening ``` or ~~~
(\{?\.?(?P<lang>[a-zA-Z0-9_+-]*))?[ ]*  # Optional {, and lang
# Optional highlight lines, single- or double-quote-delimited
(hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot))?[ ]*
}?[ ]*\n                                # Optional closing }
(?P<code>.*?)(?<=\n)
(?P=fence)[ ]*$''', re.MULTILINE | re.DOTALL | re.VERBOSE)

    def blank_out(pattern, text):
        # Overwrite one match at a time with as many newlines as it is
        # long, until the pattern no longer matches.
        hit = pattern.search(text)
        while hit is not None:
            filler = "\n" * (hit.end() - hit.start())
            text = pattern.sub(filler, text, count=1)
            hit = pattern.search(text)
        return text

    # if applicable, strip out any trouble text before parsing the headers
    filtered_source = source
    if strip_fenced_block:
        filtered_source = blank_out(FENCED_BLOCK_RE, filtered_source)
    if strip_ascii_math:
        filtered_source = blank_out(ASCIIMATHML_RE, filtered_source)

    hash_re = re.compile(r'^(#+)(.+)', re.MULTILINE)
    setext_re = re.compile(r'(.+)\n([=-]+[ ]*)(\n|$)', re.MULTILINE)

    # Entries are (position, level, text): sorting them restores document
    # order across the two header styles.
    found = [(m.start(), len(m.group(1)), m.group(2))
             for m in hash_re.finditer(filtered_source)]
    found.extend(
        (m.start(), 1 if m.group(2).startswith('=') else 2, m.group(1))
        for m in setext_re.finditer(filtered_source))
    found.sort()

    return [(level, text, position,
             unique(slugify(text, '-'), seen_anchors))
            for (position, level, text) in found]
def run(self, doc):
    """Give every header an id and store a ``<ul class="toc">`` as the TOC.

    Walks ``doc`` via ``self.iterparent``.  Each element with text whose tag
    matches h1-h6 gets an ``id`` attribute (an existing one is kept) and an
    ``<li><a href="#id">`` entry in a nested list.  The outermost ``<ul>`` is
    serialized, post-processed and stored on ``self.markdown.toc``; if no
    list was produced, ``self.markdown.toc`` is set to ``''``.
    """
    div = etree.Element("div")
    last_li = None

    level = 0
    list_stack = [div]
    header_rgx = re.compile("[Hh][123456]")

    # Get a list of id attributes
    used_ids = []
    for c in doc.getiterator():
        if "id" in c.attrib:
            used_ids.append(c.attrib["id"])

    for (p, c) in self.iterparent(doc):
        text = ''.join(itertext(c)).strip()
        if not text:
            continue

        if header_rgx.match(c.tag):
            try:
                tag_level = int(c.tag[-1])

                # Close lists until we are back at this header's depth.
                while tag_level < level:
                    list_stack.pop()
                    level -= 1

                if tag_level > level:
                    newlist = etree.Element("ul")
                    # Explicit None test: truth-testing an Element checks
                    # its child count.  last_li always has its <a> child,
                    # so the outcome is the same.
                    if last_li is not None:
                        last_li.append(newlist)
                    else:
                        list_stack[-1].append(newlist)
                    list_stack.append(newlist)
                    if level == 0:
                        level = tag_level
                    else:
                        level += 1

                # Do not override pre-existing ids
                if "id" not in c.attrib:
                    elem_id = unique(self.config["slugify"](text, '-'), used_ids)
                    c.attrib["id"] = elem_id
                else:
                    elem_id = c.attrib["id"]

                # List item link, to be inserted into the toc
                last_li = etree.Element("li")
                link = etree.SubElement(last_li, "a")
                link.text = text
                link.attrib["href"] = '#' + elem_id

                if self.config["anchorlink"]:
                    anchor = etree.Element("a")
                    anchor.text = c.text
                    anchor.attrib["href"] = "#" + elem_id
                    anchor.attrib["class"] = "toclink"
                    c.text = ""
                    # Iterate over a snapshot: removing children while
                    # walking the live child list skips siblings.
                    for elem in list(c):
                        anchor.append(elem)
                        c.remove(elem)
                    c.append(anchor)

                list_stack[-1].append(last_li)
            except IndexError:
                # We have bad ordering of headers. Just move on.
                pass

    # serialize and attach to markdown instance.
    ul = div.find('ul')
    # find() returns None when nothing matched.  Test for that explicitly
    # rather than relying on Element truthiness (child count).
    if ul is None:
        self.markdown.toc = ''
        return
    ul.attrib["class"] = "toc"
    toc = self.markdown.serializer(ul)
    for pp in self.markdown.postprocessors.values():
        toc = pp.run(toc)
    self.markdown.toc = toc
def run(self, doc):
    """Assign ids to headers and build the "Contents" box for the document.

    Walks ``doc`` via ``self.iterparent``.  Every element with text whose
    tag matches h1-h6 gets an ``id`` attribute (an existing one is kept), is
    recorded for the TOC and is handed to ``self.add_anchor``.  An element
    whose text equals ``self.config["marker"]`` is replaced by the TOC div.
    The div is always populated (and prettified if that treeprocessor is
    registered); only when no marker was found is it serialized,
    post-processed and stored on ``self.markdown.toc``.
    """
    div = etree.Element("div")
    title = etree.Element("p")
    title.attrib["class"] = "first sidebar-title"
    title.text = "Contents"
    div.append(title)
    div.attrib["class"] = "contents alert alert-info pull-left topic"

    header_rgx = re.compile("[Hh][123456]")
    self.use_anchors = self.config["anchorlink"] in [1, '1', True, 'True', 'true']

    # Ids already present in the document must not be handed out again.
    used_ids = set(node.attrib["id"]
                   for node in doc.getiterator()
                   if "id" in node.attrib)

    entries = []
    marker_found = False
    for (parent, child) in self.iterparent(doc):
        text = ''.join(itertext(child)).strip()
        if not text:
            continue

        is_header = header_rgx.match(child.tag)

        # To keep the output from breaking validation by putting a <div>
        # inside of a <p>, the <p> is replaced in its entirety.
        # The marker is not honoured inside a header, as that would cause
        # an endless loop of placing a new TOC inside a previously
        # generated TOC.
        holds_marker = (child.text
                        and child.text.strip() == self.config["marker"])
        if holds_marker and not is_header \
                and child.tag not in ['pre', 'code']:
            for index, sibling in enumerate(parent):
                if sibling == child:
                    parent[index] = div
                    break
            marker_found = True

        if is_header:
            # Do not override pre-existing ids
            if "id" in child.attrib:
                elem_id = child.attrib["id"]
            else:
                elem_id = unique(self.config["slugify"](text, '-'), used_ids)
                child.attrib["id"] = elem_id

            entries.append({'level': int(child.tag[-1]),
                            'id': elem_id,
                            'name': text})
            self.add_anchor(child, elem_id)

    self.build_toc_etree(div, order_toc_list(entries))

    prettify = self.markdown.treeprocessors.get('prettify')
    if prettify:
        prettify.run(div)

    if marker_found:
        return

    # serialize and attach to markdown instance.
    toc = self.markdown.serializer(div)
    for postprocessor in self.markdown.postprocessors.values():
        toc = postprocessor.run(toc)
    self.markdown.toc = toc
def parseHeaders(source, strip_fenced_block=False, strip_ascii_math=False):
    """Scan Markdown text for headers to construct the Table Of Contents.

    Optionally, fenced code blocks (``strip_fenced_block``) and lines
    containing ``$$...$$`` (``strip_ascii_math``) are first replaced by
    newlines of equal length, which hides their content from the header
    scan without shifting any offsets.

    return: [(level, text, position, anchor)]
        position/anchor is the header position in notesEdit/notesView
    """
    # copied from asciimathml so we don't need a hard dependency to strip
    ASCIIMATHML_RE = re.compile(r'^(.*)\$\$([^\$]*)\$\$(.*)$', re.M)

    # copied from the fenced block code
    FENCED_BLOCK_RE = re.compile(r'''
(?P<fence>^(?:~{3,}|`{3,}))[ ]*         # Opening ``` or ~~~
(\{?\.?(?P<lang>[a-zA-Z0-9_+-]*))?[ ]*  # Optional {, and lang
# Optional highlight lines, single- or double-quote-delimited
(hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot))?[ ]*
}?[ ]*\n                                # Optional closing }
(?P<code>.*?)(?<=\n)
(?P=fence)[ ]*$''', re.MULTILINE | re.DOTALL | re.VERBOSE)

    # if applicable, strip out any trouble text before parsing the headers;
    # fenced blocks are handled before math, one match at a time.
    filtered_source = source
    maskers = ((strip_fenced_block, FENCED_BLOCK_RE),
               (strip_ascii_math, ASCIIMATHML_RE))
    for enabled, pattern in maskers:
        if not enabled:
            continue
        m = pattern.search(filtered_source)
        while m is not None:
            padding = "\n" * (m.end() - m.start())
            filtered_source = pattern.sub(padding, filtered_source, count=1)
            m = pattern.search(filtered_source)

    located = []  # (position, level, text)

    # hash headers
    for m in re.finditer(r'^(#+)(.+)', filtered_source, re.MULTILINE):
        located.append((m.start(), len(m.group(1)), m.group(2)))

    # setext headers: "=" underline is level 1, "-" underline is level 2
    for m in re.finditer(r'(.+)\n([=-]+[ ]*)(\n|$)', filtered_source,
                         re.MULTILINE):
        underline = m.group(2)
        depth = 1 if underline.startswith('=') else 2
        located.append((m.start(), depth, m.group(1)))

    headers = []
    used_ids = set()  # In case there are headers with the same name.
    for position, depth, title in sorted(located):
        anchor = unique(slugify(title, '-'), used_ids)
        headers.append((depth, title, position, anchor))
    return headers