def html2rst(text, images_dir):
    """Converts html, typically generated by tinyMCE, into rst compatible
    with Sage documentation.

    The main job is done by BeautifulSoup, which is much more robust than
    conventional parsers like HTMLParser, but also several details specific
    of this context are taken into account, so this code differs from
    generic approaches like those found on the web.

    INPUT:

    - ``text`` -- string -- a chunk of HTML text

    - ``images_dir`` -- string -- folder where images are stored

    OUTPUT:

    - string -- rst text

    EXAMPLES::

        sage: from sagenb.misc.comments2rst import html2rst # optional - beautifulsoup
        sage: html2rst('<p>Some text with <em>math</em>: $e^{\pi i}=-1$</p>', '') # optional - beautifulsoup
        u'Some text with *math* : :math:`e^{\\pi i}=-1`\n\n'
        sage: html2rst('<p>Text with <em>incorrect</p> nesting</em>.', '') # optional - beautifulsoup
        u'Text with *incorrect* \n\n nesting\n.'
        sage: html2rst('<pre>Preformatted: \n a+2\n</pre><p> Not preformatted: \n a+2\n</p>', '') # optional - beautifulsoup
        u'::\n\n Preformatted: \n a+2\n \n Not preformatted: a\\+2\n\n'
        sage: html2rst('áñ&nbsp;&nbsp;ñá','') # optional - beautifulsoup
        u'\xe1\xf1  \xf1\xe1'
        sage: html2rst('<p>some text</p><p>$$</p><p>3.183098861 \cdot 10^{-1}</p><p>$$</p>','') # optional - beautifulsoup
        u'some text\n\n.. MATH::\n\n    3.183098861 \\cdot 10^{-1}\n\n.. end of math\n\n'
    """
    # replace $$some display latex$$ with
    # <display>some display latex</display>
    text = preprocess_display_latex(text)

    # eliminate nasty &nbsp; entities before the soup converts them to
    # non-breaking-space characters
    # BUGFIX: the previous code was text.replace(' ', ' ') -- a no-op that
    # replaced a plain space with a plain space; the intended target (per
    # the original comment) is the literal '&nbsp;' entity
    text = text.replace('&nbsp;', ' ')

    # ICantBelieveItsBeautifulSoup is better than BeautifulSoup
    # for html that wasn't generated by humans (like tinyMCE)
    soup = ICantBelieveItsBeautifulSoup(
        text, convertEntities=ICantBelieveItsBeautifulSoup.ALL_ENTITIES)

    # remove all HTML comments so they don't leak into the rst output
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    replace_latex(soup)
    v = Soup2Rst(images_dir)
    text = v.visit(soup)

    # collapse any run of blank lines down to a single blank line
    more_than_2_blank_lines = re.compile(r'\n\n+', re.MULTILINE)
    text = more_than_2_blank_lines.sub('\n\n', text)
    text = replace_xml_entities(text)
    return text
# NOTE(review): this is a second, duplicate definition of html2rst; at import
# time it silently shadows the earlier copy in this file. Consider deleting
# one of the two.
def html2rst(text, images_dir):
    """Converts html, typically generated by tinyMCE, into rst compatible
    with Sage documentation.

    The main job is done by BeautifulSoup, which is much more robust than
    conventional parsers like HTMLParser, but also several details specific
    of this context are taken into account, so this code differs from
    generic approaches like those found on the web.

    INPUT:

    - ``text`` -- string -- a chunk of HTML text

    - ``images_dir`` -- string -- folder where images are stored

    OUTPUT:

    - string -- rst text

    EXAMPLES::

        sage: from sagenb.misc.comments2rst import html2rst # optional - beautifulsoup
        sage: html2rst('<p>Some text with <em>math</em>: $e^{\pi i}=-1$</p>', '') # optional - beautifulsoup
        u'Some text with *math* : :math:`e^{\\pi i}=-1`\n\n'
        sage: html2rst('<p>Text with <em>incorrect</p> nesting</em>.', '') # optional - beautifulsoup
        u'Text with *incorrect* \n\n nesting\n.'
        sage: html2rst('<pre>Preformatted: \n a+2\n</pre><p> Not preformatted: \n a+2\n</p>', '') # optional - beautifulsoup
        u'::\n\n Preformatted: \n a+2\n \n Not preformatted: a\\+2\n\n'
        sage: html2rst('áñ&nbsp;&nbsp;ñá','') # optional - beautifulsoup
        u'\xe1\xf1  \xf1\xe1'
        sage: html2rst('<p>some text</p><p>$$</p><p>3.183098861 \cdot 10^{-1}</p><p>$$</p>','') # optional - beautifulsoup
        u'some text\n\n.. MATH::\n\n    3.183098861 \\cdot 10^{-1}\n\n.. end of math\n\n'
    """
    # replace $$some display latex$$ with
    # <display>some display latex</display>
    text = preprocess_display_latex(text)

    # eliminate nasty &nbsp; entities before the soup converts them to
    # non-breaking-space characters
    # BUGFIX: the previous code was text.replace(' ', ' ') -- a no-op that
    # replaced a plain space with a plain space; the intended target (per
    # the original comment) is the literal '&nbsp;' entity
    text = text.replace('&nbsp;', ' ')

    # ICantBelieveItsBeautifulSoup is better than BeautifulSoup
    # for html that wasn't generated by humans (like tinyMCE)
    soup = ICantBelieveItsBeautifulSoup(
        text, convertEntities=ICantBelieveItsBeautifulSoup.ALL_ENTITIES)

    # remove all HTML comments so they don't leak into the rst output
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    replace_latex(soup)
    v = Soup2Rst(images_dir)
    text = v.visit(soup)

    # collapse any run of blank lines down to a single blank line
    more_than_2_blank_lines = re.compile(r'\n\n+', re.MULTILINE)
    text = more_than_2_blank_lines.sub('\n\n', text)
    text = replace_xml_entities(text)
    return text
def typygmentdown(text, **kwargs): """ Given a string of text using Markdown syntax, applies the following transformations: 1. Searches out and temporarily removes any raw ``<code>`` elements in the text. 2. Applies Markdown and typogrify to the remaining text. 3. Applies Pygments highlighting to the contents of the removed ``<code>`` elements. 4. Re-inserts the ``<code>`` elements and returns the result. The Pygments lexer to be used for highlighting is determined by the ``class`` attribute of each ``<code>`` element found; if none is present, it will attempt to guess the correct lexer before falling back on plain text. The following keyword arguments are understood and passed to markdown if found: * ``extensions`` Markdown's ``safe_mode`` argument is *not* passed on, because it would cause the temporary ``<code>`` elements in the text to be escaped. The following keyword arguments are understood and passed to Pygments if found: * ``linenos`` The removal, separate highlighting and re-insertion of the ``<code>`` elements is necessary because Markdown and SmartyPants do not reliably avoid formatting text inside these elements; removing them before applying Markdown and typogrify means they are in no danger of having extraneous HTML or fancy punctuation inserted by mistake. Original implementation by user 'blinks' as snippet #119 at djangosnippets: http://www.djangosnippets.org/snippets/119/. This version makes the following changes: * The name of the function is now ``typygmentdown``. * The argument signature has changed to work better with the ``template_utils`` formatter. * The ``extensions`` and ``linenos`` arguments are looked for and passed to Markdown and Pygments, respectively. * The function is registered with the ``template_utils`` formatter. 
""" #soup = BeautifulSoup(unicode(text)) #code_blocks = soup.findAll(u'code') #for block in code_blocks: # block.replaceWith(u'<code class="removed"></code>') text2 = unicode(text) code_blocks = [] converted = [] start = 0 end = len(text2) # loop over every tag reg = re.compile(u'<code.*?code>', re.X | re.DOTALL) for m in reg.finditer(text2): # ignore comments #if not re.match(r'^!--(.*)--',tag): block = m.group(0) start_block = m.start() end_block = m.end() code_blocks.append(block) if start < start_block: converted.append(text2[start:(start_block-1)]) converted.append(u'<code class="removed"></code>') start = end_block + 1 if start < end: converted.append(text2[start:end]) htmlized = typogrify(markdown(u''.join(converted), extensions=kwargs.get('extensions', []))) ##typogrify() soup = ICantBelieveItsBeautifulSoup(htmlized) empty_code_blocks, index = soup.findAll('code', 'removed'), 0 formatter = HtmlFormatter(cssclass='typygmentdown', linenos=kwargs.get('linenos', False)) for block in code_blocks: #import pdb #pdb.set_trace() m = re.search(u"class=\"(?P<class_name>\w+)\">(?P<block_content>.*)</code>", block, re.DOTALL) block_content = u'' if m: block_group = m.groupdict() block_content = block_group['block_content'] language = block_group['class_name'] else: language = 'text' try: lexer = get_lexer_by_name(language, stripnl=True, encoding='UTF-8') except ValueError, e: try: lexer = guess_lexer(block.renderContents()) except ValueError, e: lexer = get_lexer_by_name('text', stripnl=True, encoding='UTF-8')